1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 38 */ 39 40 #if defined(LIBC_SCCS) && !defined(lint) 41 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 42 #endif /* LIBC_SCCS and not lint */ 43 44 #include <sys/types.h> 45 #include <stdio.h> 46 #include <string.h> 47 #include <ctype.h> 48 #include <limits.h> 49 #include <stdlib.h> 50 #include <regex.h> 51 52 #include "utils.h" 53 #include "regex2.h" 54 55 #include "cclass.h" 56 #include "cname.h" 57 58 /* 59 * parse structure, passed up and down to avoid global variables and 60 * other clumsinesses 61 */ 62 struct parse { 63 char *next; /* next character in RE */ 64 char *end; /* end of string (-> NUL normally) */ 65 int error; /* has an error been seen? */ 66 sop *strip; /* malloced strip */ 67 sopno ssize; /* malloced strip size (allocated) */ 68 sopno slen; /* malloced strip length (used) */ 69 int ncsalloc; /* number of csets allocated */ 70 struct re_guts *g; 71 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 72 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 73 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 74 }; 75 76 /* ========= begin header generated by ./mkh ========= */ 77 #ifdef __cplusplus 78 extern "C" { 79 #endif 80 81 /* === regcomp.c === */ 82 static void p_ere __P((struct parse *p, int stop)); 83 static void p_ere_exp __P((struct parse *p)); 84 static void p_str __P((struct parse *p)); 85 static void p_bre __P((struct parse *p, int end1, int end2)); 86 static int p_simp_re __P((struct parse *p, int starordinary)); 87 static int p_count __P((struct parse *p)); 88 static void p_bracket __P((struct parse *p)); 89 static void p_b_term __P((struct parse *p, cset *cs)); 90 static void p_b_cclass __P((struct parse *p, cset *cs)); 91 static void p_b_eclass __P((struct parse *p, cset *cs)); 92 static char p_b_symbol __P((struct parse *p)); 93 static char p_b_coll_elem __P((struct parse *p, int endc)); 94 static char othercase __P((int ch)); 95 static void bothcases __P((struct parse *p, int ch)); 96 static void ordinary __P((struct parse *p, int ch)); 97 static void nonnewline __P((struct parse *p)); 98 static void repeat __P((struct parse *p, sopno start, int from, int to)); 99 static int seterr __P((struct parse *p, int e)); 100 static cset *allocset __P((struct parse *p)); 101 static void freeset __P((struct parse *p, cset *cs)); 102 static int freezeset __P((struct parse *p, cset *cs)); 103 static int firstch __P((struct parse *p, cset *cs)); 104 static int nch __P((struct parse *p, cset *cs)); 105 static void mcadd __P((struct parse *p, cset *cs, char *cp)); 106 static void mcsub __P((cset *cs, char *cp)); 107 static int mcin __P((cset *cs, char *cp)); 108 static char *mcfind __P((cset *cs, char *cp)); 109 static void mcinvert __P((struct parse *p, cset *cs)); 110 static void mccase __P((struct parse *p, cset *cs)); 111 static int isinsets __P((struct re_guts *g, int c)); 112 static int samesets __P((struct re_guts *g, int c1, int c2)); 113 static void categorize __P((struct parse *p, struct re_guts *g)); 114 static sopno dupl __P((struct parse *p, sopno start, sopno finish)); 115 static void doemit __P((struct parse *p, sop op, size_t opnd)); 116 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos)); 117 static void dofwd __P((struct parse *p, sopno pos, sop value)); 118 static void enlarge __P((struct parse *p, sopno size)); 119 static void stripsnug __P((struct parse *p, struct re_guts *g)); 120 static void findmust __P((struct parse *p, struct re_guts *g)); 121 static sopno pluscount __P((struct parse *p, struct re_guts *g)); 122 123 #ifdef __cplusplus 124 } 125 #endif 126 /* ========= end header generated by ./mkh ========= */ 127 128 static char nuls[10]; /* place to point scanner in event of error */ 129 130 /* 131 * macros for use with parse structure 132 * BEWARE: these know that the parse structure is named `p' !!! 133 */ 134 #define PEEK() (*p->next) 135 #define PEEK2() (*(p->next+1)) 136 #define MORE() (p->next < p->end) 137 #define MORE2() (p->next+1 < p->end) 138 #define SEE(c) (MORE() && PEEK() == (c)) 139 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 140 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 141 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 142 #define NEXT() (p->next++) 143 #define NEXT2() (p->next += 2) 144 #define NEXTn(n) (p->next += (n)) 145 #define GETNEXT() (*p->next++) 146 #define SETERROR(e) seterr(p, (e)) 147 #define REQUIRE(co, e) ((co) || SETERROR(e)) 148 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 149 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 150 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 151 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 152 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 153 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 154 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 155 #define HERE() (p->slen) 156 #define THERE() (p->slen - 1) 157 #define THERETHERE() (p->slen - 2) 158 #define DROP(n) (p->slen -= (n)) 159 160 #ifndef NDEBUG 161 static int never = 0; /* for use in asserts; shuts lint up */ 162 #else 163 #define never 0 /* some <assert.h>s have bugs too */ 164 #endif 165 166 /* 167 - regcomp - interface for parser and compilation 168 = extern int regcomp(regex_t *, const char *, int); 169 = #define REG_BASIC 0000 170 = #define REG_EXTENDED 0001 171 = #define REG_ICASE 0002 172 = #define REG_NOSUB 0004 173 = #define REG_NEWLINE 0010 174 = #define REG_NOSPEC 0020 175 = #define REG_PEND 0040 176 = #define REG_DUMP 0200 177 */ 178 int /* 0 success, otherwise REG_something */ 179 regcomp(preg, pattern, cflags) 180 regex_t *preg; 181 const char *pattern; 182 int cflags; 183 { 184 struct parse pa; 185 register struct re_guts *g; 186 register struct parse *p = &pa; 187 register int i; 188 register size_t len; 189 #ifdef REDEBUG 190 # define GOODFLAGS(f) (f) 191 #else 192 # define GOODFLAGS(f) ((f)&~REG_DUMP) 193 #endif 194 195 cflags = GOODFLAGS(cflags); 196 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 197 return(REG_INVARG); 198 199 if (cflags®_PEND) { 200 if (preg->re_endp < pattern) 201 return(REG_INVARG); 202 len = preg->re_endp - pattern; 203 } else 204 len = strlen((char *)pattern); 205 206 /* do the mallocs early so failure handling is easy */ 207 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 208 (NC-1)*sizeof(cat_t)); 209 if (g == NULL) 210 return(REG_ESPACE); 211 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 212 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 213 p->slen = 0; 214 if (p->strip == NULL) { 215 free((char *)g); 216 return(REG_ESPACE); 217 } 218 219 /* set things up */ 220 p->g = g; 221 p->next = (char *)pattern; /* convenience; we do not modify it */ 222 p->end = p->next + len; 223 p->error = 0; 224 p->ncsalloc = 0; 225 for (i = 0; i < NPAREN; i++) { 226 p->pbegin[i] = 0; 227 p->pend[i] = 0; 228 } 229 g->csetsize = NC; 230 g->sets = NULL; 231 g->setbits = NULL; 232 g->ncsets = 0; 233 g->cflags = cflags; 234 g->iflags = 0; 235 g->nbol = 0; 236 g->neol = 0; 237 g->must = NULL; 238 g->mlen = 0; 239 g->nsub = 0; 240 g->ncategories = 1; /* category 0 is "everything else" */ 241 g->categories = &g->catspace[-(CHAR_MIN)]; 242 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 243 g->backrefs = 0; 244 245 /* do it */ 246 EMIT(OEND, 0); 247 g->firststate = THERE(); 248 if (cflags®_EXTENDED) 249 p_ere(p, OUT); 250 else if (cflags®_NOSPEC) 251 p_str(p); 252 else 253 p_bre(p, OUT, OUT); 254 EMIT(OEND, 0); 255 g->laststate = THERE(); 256 257 /* tidy up loose ends and fill things in */ 258 categorize(p, g); 259 stripsnug(p, g); 260 findmust(p, g); 261 g->nplus = pluscount(p, g); 262 g->magic = MAGIC2; 263 preg->re_nsub = g->nsub; 264 preg->re_g = g; 265 preg->re_magic = MAGIC1; 266 #ifndef REDEBUG 267 /* not debugging, so can't rely on the assert() in regexec() */ 268 if (g->iflags&BAD) 269 SETERROR(REG_ASSERT); 270 #endif 271 272 /* win or lose, we're done */ 273 if (p->error != 0) /* lose */ 274 regfree(preg); 275 return(p->error); 276 } 277 278 /* 279 - p_ere - ERE parser top level, concatenation and alternation 280 == static void p_ere(register struct parse *p, int stop); 281 */ 282 static void 283 p_ere(p, stop) 284 register struct parse *p; 285 int stop; /* character this ERE should end at */ 286 { 287 register char c; 288 register sopno prevback; 289 register sopno prevfwd; 290 register sopno conc; 291 register int first = 1; /* is this the first alternative? */ 292 293 for (;;) { 294 /* do a bunch of concatenated expressions */ 295 conc = HERE(); 296 while (MORE() && (c = PEEK()) != '|' && c != stop) 297 p_ere_exp(p); 298 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 299 300 if (!EAT('|')) 301 break; /* NOTE BREAK OUT */ 302 303 if (first) { 304 INSERT(OCH_, conc); /* offset is wrong */ 305 prevfwd = conc; 306 prevback = conc; 307 first = 0; 308 } 309 ASTERN(OOR1, prevback); 310 prevback = THERE(); 311 AHEAD(prevfwd); /* fix previous offset */ 312 prevfwd = HERE(); 313 EMIT(OOR2, 0); /* offset is very wrong */ 314 } 315 316 if (!first) { /* tail-end fixups */ 317 AHEAD(prevfwd); 318 ASTERN(O_CH, prevback); 319 } 320 321 assert(!MORE() || SEE(stop)); 322 } 323 324 /* 325 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 326 == static void p_ere_exp(register struct parse *p); 327 */ 328 static void 329 p_ere_exp(p) 330 register struct parse *p; 331 { 332 register char c; 333 register sopno pos; 334 register int count; 335 register int count2; 336 register sopno subno; 337 int wascaret = 0; 338 339 assert(MORE()); /* caller should have ensured this */ 340 c = GETNEXT(); 341 342 pos = HERE(); 343 switch (c) { 344 case '(': 345 REQUIRE(MORE(), REG_EPAREN); 346 p->g->nsub++; 347 subno = p->g->nsub; 348 if (subno < NPAREN) 349 p->pbegin[subno] = HERE(); 350 EMIT(OLPAREN, subno); 351 if (!SEE(')')) 352 p_ere(p, ')'); 353 if (subno < NPAREN) { 354 p->pend[subno] = HERE(); 355 assert(p->pend[subno] != 0); 356 } 357 EMIT(ORPAREN, subno); 358 MUSTEAT(')', REG_EPAREN); 359 break; 360 #ifndef POSIX_MISTAKE 361 case ')': /* happens only if no current unmatched ( */ 362 /* 363 * You may ask, why the ifndef? Because I didn't notice 364 * this until slightly too late for 1003.2, and none of the 365 * other 1003.2 regular-expression reviewers noticed it at 366 * all. So an unmatched ) is legal POSIX, at least until 367 * we can get it fixed. 368 */ 369 SETERROR(REG_EPAREN); 370 break; 371 #endif 372 case '^': 373 EMIT(OBOL, 0); 374 p->g->iflags |= USEBOL; 375 p->g->nbol++; 376 wascaret = 1; 377 break; 378 case '$': 379 EMIT(OEOL, 0); 380 p->g->iflags |= USEEOL; 381 p->g->neol++; 382 break; 383 case '|': 384 SETERROR(REG_EMPTY); 385 break; 386 case '*': 387 case '+': 388 case '?': 389 SETERROR(REG_BADRPT); 390 break; 391 case '.': 392 if (p->g->cflags®_NEWLINE) 393 nonnewline(p); 394 else 395 EMIT(OANY, 0); 396 break; 397 case '[': 398 p_bracket(p); 399 break; 400 case '\\': 401 REQUIRE(MORE(), REG_EESCAPE); 402 c = GETNEXT(); 403 ordinary(p, c); 404 break; 405 case '{': /* okay as ordinary except if digit follows */ 406 REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); 407 /* FALLTHROUGH */ 408 default: 409 ordinary(p, c); 410 break; 411 } 412 413 if (!MORE()) 414 return; 415 c = PEEK(); 416 /* we call { a repetition if followed by a digit */ 417 if (!( c == '*' || c == '+' || c == '?' || 418 (c == '{' && MORE2() && isdigit(PEEK2())) )) 419 return; /* no repetition, we're done */ 420 NEXT(); 421 422 REQUIRE(!wascaret, REG_BADRPT); 423 switch (c) { 424 case '*': /* implemented as +? */ 425 /* this case does not require the (y|) trick, noKLUDGE */ 426 INSERT(OPLUS_, pos); 427 ASTERN(O_PLUS, pos); 428 INSERT(OQUEST_, pos); 429 ASTERN(O_QUEST, pos); 430 break; 431 case '+': 432 INSERT(OPLUS_, pos); 433 ASTERN(O_PLUS, pos); 434 break; 435 case '?': 436 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 437 INSERT(OCH_, pos); /* offset slightly wrong */ 438 ASTERN(OOR1, pos); /* this one's right */ 439 AHEAD(pos); /* fix the OCH_ */ 440 EMIT(OOR2, 0); /* offset very wrong... */ 441 AHEAD(THERE()); /* ...so fix it */ 442 ASTERN(O_CH, THERETHERE()); 443 break; 444 case '{': 445 count = p_count(p); 446 if (EAT(',')) { 447 if (isdigit(PEEK())) { 448 count2 = p_count(p); 449 REQUIRE(count <= count2, REG_BADBR); 450 } else /* single number with comma */ 451 count2 = INFINITY; 452 } else /* just a single number */ 453 count2 = count; 454 repeat(p, pos, count, count2); 455 if (!EAT('}')) { /* error heuristics */ 456 while (MORE() && PEEK() != '}') 457 NEXT(); 458 REQUIRE(MORE(), REG_EBRACE); 459 SETERROR(REG_BADBR); 460 } 461 break; 462 } 463 464 if (!MORE()) 465 return; 466 c = PEEK(); 467 if (!( c == '*' || c == '+' || c == '?' || 468 (c == '{' && MORE2() && isdigit(PEEK2())) ) ) 469 return; 470 SETERROR(REG_BADRPT); 471 } 472 473 /* 474 - p_str - string (no metacharacters) "parser" 475 == static void p_str(register struct parse *p); 476 */ 477 static void 478 p_str(p) 479 register struct parse *p; 480 { 481 REQUIRE(MORE(), REG_EMPTY); 482 while (MORE()) 483 ordinary(p, GETNEXT()); 484 } 485 486 /* 487 - p_bre - BRE parser top level, anchoring and concatenation 488 == static void p_bre(register struct parse *p, register int end1, \ 489 == register int end2); 490 * Giving end1 as OUT essentially eliminates the end1/end2 check. 491 * 492 * This implementation is a bit of a kludge, in that a trailing $ is first 493 * taken as an ordinary character and then revised to be an anchor. The 494 * only undesirable side effect is that '$' gets included as a character 495 * category in such cases. This is fairly harmless; not worth fixing. 496 * The amount of lookahead needed to avoid this kludge is excessive. 497 */ 498 static void 499 p_bre(p, end1, end2) 500 register struct parse *p; 501 register int end1; /* first terminating character */ 502 register int end2; /* second terminating character */ 503 { 504 register sopno start = HERE(); 505 register int first = 1; /* first subexpression? */ 506 register int wasdollar = 0; 507 508 if (EAT('^')) { 509 EMIT(OBOL, 0); 510 p->g->iflags |= USEBOL; 511 p->g->nbol++; 512 } 513 while (MORE() && !SEETWO(end1, end2)) { 514 wasdollar = p_simp_re(p, first); 515 first = 0; 516 } 517 if (wasdollar) { /* oops, that was a trailing anchor */ 518 DROP(1); 519 EMIT(OEOL, 0); 520 p->g->iflags |= USEEOL; 521 p->g->neol++; 522 } 523 524 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 525 } 526 527 /* 528 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 529 == static int p_simp_re(register struct parse *p, int starordinary); 530 */ 531 static int /* was the simple RE an unbackslashed $? */ 532 p_simp_re(p, starordinary) 533 register struct parse *p; 534 int starordinary; /* is a leading * an ordinary character? */ 535 { 536 register int c; 537 register int count; 538 register int count2; 539 register sopno pos; 540 register int i; 541 register sopno subno; 542 # define BACKSL (1<<CHAR_BIT) 543 544 pos = HERE(); /* repetion op, if any, covers from here */ 545 546 assert(MORE()); /* caller should have ensured this */ 547 c = GETNEXT(); 548 if (c == '\\') { 549 REQUIRE(MORE(), REG_EESCAPE); 550 c = BACKSL | (unsigned char)GETNEXT(); 551 } 552 switch (c) { 553 case '.': 554 if (p->g->cflags®_NEWLINE) 555 nonnewline(p); 556 else 557 EMIT(OANY, 0); 558 break; 559 case '[': 560 p_bracket(p); 561 break; 562 case BACKSL|'{': 563 SETERROR(REG_BADRPT); 564 break; 565 case BACKSL|'(': 566 p->g->nsub++; 567 subno = p->g->nsub; 568 if (subno < NPAREN) 569 p->pbegin[subno] = HERE(); 570 EMIT(OLPAREN, subno); 571 /* the MORE here is an error heuristic */ 572 if (MORE() && !SEETWO('\\', ')')) 573 p_bre(p, '\\', ')'); 574 if (subno < NPAREN) { 575 p->pend[subno] = HERE(); 576 assert(p->pend[subno] != 0); 577 } 578 EMIT(ORPAREN, subno); 579 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 580 break; 581 case BACKSL|')': /* should not get here -- must be user */ 582 case BACKSL|'}': 583 SETERROR(REG_EPAREN); 584 break; 585 case BACKSL|'1': 586 case BACKSL|'2': 587 case BACKSL|'3': 588 case BACKSL|'4': 589 case BACKSL|'5': 590 case BACKSL|'6': 591 case BACKSL|'7': 592 case BACKSL|'8': 593 case BACKSL|'9': 594 i = (c&~BACKSL) - '0'; 595 assert(i < NPAREN); 596 if (p->pend[i] != 0) { 597 assert(i <= p->g->nsub); 598 EMIT(OBACK_, i); 599 assert(p->pbegin[i] != 0); 600 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 601 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 602 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 603 EMIT(O_BACK, i); 604 } else 605 SETERROR(REG_ESUBREG); 606 p->g->backrefs = 1; 607 break; 608 case '*': 609 REQUIRE(starordinary, REG_BADRPT); 610 /* FALLTHROUGH */ 611 default: 612 ordinary(p, c &~ BACKSL); 613 break; 614 } 615 616 if (EAT('*')) { /* implemented as +? */ 617 /* this case does not require the (y|) trick, noKLUDGE */ 618 INSERT(OPLUS_, pos); 619 ASTERN(O_PLUS, pos); 620 INSERT(OQUEST_, pos); 621 ASTERN(O_QUEST, pos); 622 } else if (EATTWO('\\', '{')) { 623 count = p_count(p); 624 if (EAT(',')) { 625 if (MORE() && isdigit(PEEK())) { 626 count2 = p_count(p); 627 REQUIRE(count <= count2, REG_BADBR); 628 } else /* single number with comma */ 629 count2 = INFINITY; 630 } else /* just a single number */ 631 count2 = count; 632 repeat(p, pos, count, count2); 633 if (!EATTWO('\\', '}')) { /* error heuristics */ 634 while (MORE() && !SEETWO('\\', '}')) 635 NEXT(); 636 REQUIRE(MORE(), REG_EBRACE); 637 SETERROR(REG_BADBR); 638 } 639 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 640 return(1); 641 642 return(0); 643 } 644 645 /* 646 - p_count - parse a repetition count 647 == static int p_count(register struct parse *p); 648 */ 649 static int /* the value */ 650 p_count(p) 651 register struct parse *p; 652 { 653 register int count = 0; 654 register int ndigits = 0; 655 656 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) { 657 count = count*10 + (GETNEXT() - '0'); 658 ndigits++; 659 } 660 661 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 662 return(count); 663 } 664 665 /* 666 - p_bracket - parse a bracketed character list 667 == static void p_bracket(register struct parse *p); 668 * 669 * Note a significant property of this code: if the allocset() did SETERROR, 670 * no set operations are done. 671 */ 672 static void 673 p_bracket(p) 674 register struct parse *p; 675 { 676 register char c; 677 register cset *cs = allocset(p); 678 register int invert = 0; 679 680 /* Dept of Truly Sickening Special-Case Kludges */ 681 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 682 EMIT(OBOW, 0); 683 NEXTn(6); 684 return; 685 } 686 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 687 EMIT(OEOW, 0); 688 NEXTn(6); 689 return; 690 } 691 692 if (EAT('^')) 693 invert++; /* make note to invert set at end */ 694 if (EAT(']')) 695 CHadd(cs, ']'); 696 else if (EAT('-')) 697 CHadd(cs, '-'); 698 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 699 p_b_term(p, cs); 700 if (EAT('-')) 701 CHadd(cs, '-'); 702 MUSTEAT(']', REG_EBRACK); 703 704 if (p->error != 0) /* don't mess things up further */ 705 return; 706 707 if (p->g->cflags®_ICASE) { 708 register int i; 709 register int ci; 710 711 for (i = p->g->csetsize - 1; i >= 0; i--) 712 if (CHIN(cs, i) && isalpha(i)) { 713 ci = othercase(i); 714 if (ci != i) 715 CHadd(cs, ci); 716 } 717 if (cs->multis != NULL) 718 mccase(p, cs); 719 } 720 if (invert) { 721 register int i; 722 723 for (i = p->g->csetsize - 1; i >= 0; i--) 724 if (CHIN(cs, i)) 725 CHsub(cs, i); 726 else 727 CHadd(cs, i); 728 if (p->g->cflags®_NEWLINE) 729 CHsub(cs, '\n'); 730 if (cs->multis != NULL) 731 mcinvert(p, cs); 732 } 733 734 assert(cs->multis == NULL); /* xxx */ 735 736 if (nch(p, cs) == 1) { /* optimize singleton sets */ 737 ordinary(p, firstch(p, cs)); 738 freeset(p, cs); 739 } else 740 EMIT(OANYOF, freezeset(p, cs)); 741 } 742 743 /* 744 - p_b_term - parse one term of a bracketed character list 745 == static void p_b_term(register struct parse *p, register cset *cs); 746 */ 747 static void 748 p_b_term(p, cs) 749 register struct parse *p; 750 register cset *cs; 751 { 752 register char c; 753 register char start, finish; 754 register int i; 755 756 /* classify what we've got */ 757 switch ((MORE()) ? PEEK() : '\0') { 758 case '[': 759 c = (MORE2()) ? PEEK2() : '\0'; 760 break; 761 case '-': 762 SETERROR(REG_ERANGE); 763 return; /* NOTE RETURN */ 764 break; 765 default: 766 c = '\0'; 767 break; 768 } 769 770 switch (c) { 771 case ':': /* character class */ 772 NEXT2(); 773 REQUIRE(MORE(), REG_EBRACK); 774 c = PEEK(); 775 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 776 p_b_cclass(p, cs); 777 REQUIRE(MORE(), REG_EBRACK); 778 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 779 break; 780 case '=': /* equivalence class */ 781 NEXT2(); 782 REQUIRE(MORE(), REG_EBRACK); 783 c = PEEK(); 784 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 785 p_b_eclass(p, cs); 786 REQUIRE(MORE(), REG_EBRACK); 787 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 788 break; 789 default: /* symbol, ordinary character, or range */ 790 /* xxx revision needed for multichar stuff */ 791 start = p_b_symbol(p); 792 if (SEE('-') && MORE2() && PEEK2() != ']') { 793 /* range */ 794 NEXT(); 795 if (EAT('-')) 796 finish = '-'; 797 else 798 finish = p_b_symbol(p); 799 } else 800 finish = start; 801 /* xxx what about signed chars here... */ 802 REQUIRE(start <= finish, REG_ERANGE); 803 for (i = start; i <= finish; i++) 804 CHadd(cs, i); 805 break; 806 } 807 } 808 809 /* 810 - p_b_cclass - parse a character-class name and deal with it 811 == static void p_b_cclass(register struct parse *p, register cset *cs); 812 */ 813 static void 814 p_b_cclass(p, cs) 815 register struct parse *p; 816 register cset *cs; 817 { 818 register char *sp = p->next; 819 register struct cclass *cp; 820 register size_t len; 821 register char *u; 822 register char c; 823 824 while (MORE() && isalpha(PEEK())) 825 NEXT(); 826 len = p->next - sp; 827 for (cp = cclasses; cp->name != NULL; cp++) 828 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 829 break; 830 if (cp->name == NULL) { 831 /* oops, didn't find it */ 832 SETERROR(REG_ECTYPE); 833 return; 834 } 835 836 u = cp->chars; 837 while ((c = *u++) != '\0') 838 CHadd(cs, c); 839 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 840 MCadd(p, cs, u); 841 } 842 843 /* 844 - p_b_eclass - parse an equivalence-class name and deal with it 845 == static void p_b_eclass(register struct parse *p, register cset *cs); 846 * 847 * This implementation is incomplete. xxx 848 */ 849 static void 850 p_b_eclass(p, cs) 851 register struct parse *p; 852 register cset *cs; 853 { 854 register char c; 855 856 c = p_b_coll_elem(p, '='); 857 CHadd(cs, c); 858 } 859 860 /* 861 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 862 == static char p_b_symbol(register struct parse *p); 863 */ 864 static char /* value of symbol */ 865 p_b_symbol(p) 866 register struct parse *p; 867 { 868 register char value; 869 870 REQUIRE(MORE(), REG_EBRACK); 871 if (!EATTWO('[', '.')) 872 return(GETNEXT()); 873 874 /* collating symbol */ 875 value = p_b_coll_elem(p, '.'); 876 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 877 return(value); 878 } 879 880 /* 881 - p_b_coll_elem - parse a collating-element name and look it up 882 == static char p_b_coll_elem(register struct parse *p, int endc); 883 */ 884 static char /* value of collating element */ 885 p_b_coll_elem(p, endc) 886 register struct parse *p; 887 int endc; /* name ended by endc,']' */ 888 { 889 register char *sp = p->next; 890 register struct cname *cp; 891 register int len; 892 register char c; 893 894 while (MORE() && !SEETWO(endc, ']')) 895 NEXT(); 896 if (!MORE()) { 897 SETERROR(REG_EBRACK); 898 return(0); 899 } 900 len = p->next - sp; 901 for (cp = cnames; cp->name != NULL; cp++) 902 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 903 return(cp->code); /* known name */ 904 if (len == 1) 905 return(*sp); /* single character */ 906 SETERROR(REG_ECOLLATE); /* neither */ 907 return(0); 908 } 909 910 /* 911 - othercase - return the case counterpart of an alphabetic 912 == static char othercase(int ch); 913 */ 914 static char /* if no counterpart, return ch */ 915 othercase(ch) 916 int ch; 917 { 918 assert(isalpha(ch)); 919 if (isupper(ch)) 920 return(tolower(ch)); 921 else if (islower(ch)) 922 return(toupper(ch)); 923 else /* peculiar, but could happen */ 924 return(ch); 925 } 926 927 /* 928 - bothcases - emit a dualcase version of a two-case character 929 == static void bothcases(register struct parse *p, int ch); 930 * 931 * Boy, is this implementation ever a kludge... 932 */ 933 static void 934 bothcases(p, ch) 935 register struct parse *p; 936 int ch; 937 { 938 register char *oldnext = p->next; 939 register char *oldend = p->end; 940 char bracket[3]; 941 942 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 943 p->next = bracket; 944 p->end = bracket+2; 945 bracket[0] = ch; 946 bracket[1] = ']'; 947 bracket[2] = '\0'; 948 p_bracket(p); 949 assert(p->next == bracket+2); 950 p->next = oldnext; 951 p->end = oldend; 952 } 953 954 /* 955 - ordinary - emit an ordinary character 956 == static void ordinary(register struct parse *p, register int ch); 957 */ 958 static void 959 ordinary(p, ch) 960 register struct parse *p; 961 register int ch; 962 { 963 register cat_t *cap = p->g->categories; 964 965 if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch) 966 bothcases(p, ch); 967 else { 968 EMIT(OCHAR, (unsigned char)ch); 969 if (cap[ch] == 0) 970 cap[ch] = p->g->ncategories++; 971 } 972 } 973 974 /* 975 - nonnewline - emit REG_NEWLINE version of OANY 976 == static void nonnewline(register struct parse *p); 977 * 978 * Boy, is this implementation ever a kludge... 979 */ 980 static void 981 nonnewline(p) 982 register struct parse *p; 983 { 984 register char *oldnext = p->next; 985 register char *oldend = p->end; 986 char bracket[4]; 987 988 p->next = bracket; 989 p->end = bracket+3; 990 bracket[0] = '^'; 991 bracket[1] = '\n'; 992 bracket[2] = ']'; 993 bracket[3] = '\0'; 994 p_bracket(p); 995 assert(p->next == bracket+3); 996 p->next = oldnext; 997 p->end = oldend; 998 } 999 1000 /* 1001 - repeat - generate code for a bounded repetition, recursively if needed 1002 == static void repeat(register struct parse *p, sopno start, int from, int to); 1003 */ 1004 static void 1005 repeat(p, start, from, to) 1006 register struct parse *p; 1007 sopno start; /* operand from here to end of strip */ 1008 int from; /* repeated from this number */ 1009 int to; /* to this number of times (maybe INFINITY) */ 1010 { 1011 register sopno finish = HERE(); 1012 # define N 2 1013 # define INF 3 1014 # define REP(f, t) ((f)*8 + (t)) 1015 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1016 register sopno copy; 1017 1018 if (p->error != 0) /* head off possible runaway recursion */ 1019 return; 1020 1021 assert(from <= to); 1022 1023 switch (REP(MAP(from), MAP(to))) { 1024 case REP(0, 0): /* must be user doing this */ 1025 DROP(finish-start); /* drop the operand */ 1026 break; 1027 case REP(0, 1): /* as x{1,1}? */ 1028 case REP(0, N): /* as x{1,n}? */ 1029 case REP(0, INF): /* as x{1,}? */ 1030 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1031 INSERT(OCH_, start); /* offset is wrong... */ 1032 repeat(p, start+1, 1, to); 1033 ASTERN(OOR1, start); 1034 AHEAD(start); /* ... fix it */ 1035 EMIT(OOR2, 0); 1036 AHEAD(THERE()); 1037 ASTERN(O_CH, THERETHERE()); 1038 break; 1039 case REP(1, 1): /* trivial case */ 1040 /* done */ 1041 break; 1042 case REP(1, N): /* as x?x{1,n-1} */ 1043 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1044 INSERT(OCH_, start); 1045 ASTERN(OOR1, start); 1046 AHEAD(start); 1047 EMIT(OOR2, 0); /* offset very wrong... */ 1048 AHEAD(THERE()); /* ...so fix it */ 1049 ASTERN(O_CH, THERETHERE()); 1050 copy = dupl(p, start+1, finish+1); 1051 assert(copy == finish+4); 1052 repeat(p, copy, 1, to-1); 1053 break; 1054 case REP(1, INF): /* as x+ */ 1055 INSERT(OPLUS_, start); 1056 ASTERN(O_PLUS, start); 1057 break; 1058 case REP(N, N): /* as xx{m-1,n-1} */ 1059 copy = dupl(p, start, finish); 1060 repeat(p, copy, from-1, to-1); 1061 break; 1062 case REP(N, INF): /* as xx{n-1,INF} */ 1063 copy = dupl(p, start, finish); 1064 repeat(p, copy, from-1, to); 1065 break; 1066 default: /* "can't happen" */ 1067 SETERROR(REG_ASSERT); /* just in case */ 1068 break; 1069 } 1070 } 1071 1072 /* 1073 - seterr - set an error condition 1074 == static int seterr(register struct parse *p, int e); 1075 */ 1076 static int /* useless but makes type checking happy */ 1077 seterr(p, e) 1078 register struct parse *p; 1079 int e; 1080 { 1081 if (p->error == 0) /* keep earliest error condition */ 1082 p->error = e; 1083 p->next = nuls; /* try to bring things to a halt */ 1084 p->end = nuls; 1085 return(0); /* make the return value well-defined */ 1086 } 1087 1088 /* 1089 - allocset - allocate a set of characters for [] 1090 == static cset *allocset(register struct parse *p); 1091 */ 1092 static cset * 1093 allocset(p) 1094 register struct parse *p; 1095 { 1096 register int no = p->g->ncsets++; 1097 register size_t nc; 1098 register size_t nbytes; 1099 register cset *cs; 1100 register size_t css = (size_t)p->g->csetsize; 1101 register int i; 1102 1103 if (no >= p->ncsalloc) { /* need another column of space */ 1104 p->ncsalloc += CHAR_BIT; 1105 nc = p->ncsalloc; 1106 assert(nc % CHAR_BIT == 0); 1107 nbytes = nc / CHAR_BIT * css; 1108 if (p->g->sets == NULL) 1109 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1110 else 1111 p->g->sets = (cset *)realloc((char *)p->g->sets, 1112 nc * sizeof(cset)); 1113 if (p->g->setbits == NULL) 1114 p->g->setbits = (uch *)malloc(nbytes); 1115 else { 1116 p->g->setbits = (uch *)realloc((char *)p->g->setbits, 1117 nbytes); 1118 /* xxx this isn't right if setbits is now NULL */ 1119 for (i = 0; i < no; i++) 1120 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1121 } 1122 if (p->g->sets != NULL && p->g->setbits != NULL) 1123 (void) memset((char *)p->g->setbits + (nbytes - css), 1124 0, css); 1125 else { 1126 no = 0; 1127 SETERROR(REG_ESPACE); 1128 /* caller's responsibility not to do set ops */ 1129 } 1130 } 1131 1132 assert(p->g->sets != NULL); /* xxx */ 1133 cs = &p->g->sets[no]; 1134 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1135 cs->mask = 1 << ((no) % CHAR_BIT); 1136 cs->hash = 0; 1137 cs->smultis = 0; 1138 cs->multis = NULL; 1139 1140 return(cs); 1141 } 1142 1143 /* 1144 - freeset - free a now-unused set 1145 == static void freeset(register struct parse *p, register cset *cs); 1146 */ 1147 static void 1148 freeset(p, cs) 1149 register struct parse *p; 1150 register cset *cs; 1151 { 1152 register int i; 1153 register cset *top = &p->g->sets[p->g->ncsets]; 1154 register size_t css = (size_t)p->g->csetsize; 1155 1156 for (i = 0; i < css; i++) 1157 CHsub(cs, i); 1158 if (cs == top-1) /* recover only the easy case */ 1159 p->g->ncsets--; 1160 } 1161 1162 /* 1163 - freezeset - final processing on a set of characters 1164 == static int freezeset(register struct parse *p, register cset *cs); 1165 * 1166 * The main task here is merging identical sets. This is usually a waste 1167 * of time (although the hash code minimizes the overhead), but can win 1168 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1169 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1170 * the same value! 1171 */ 1172 static int /* set number */ 1173 freezeset(p, cs) 1174 register struct parse *p; 1175 register cset *cs; 1176 { 1177 register uch h = cs->hash; 1178 register int i; 1179 register cset *top = &p->g->sets[p->g->ncsets]; 1180 register cset *cs2; 1181 register size_t css = (size_t)p->g->csetsize; 1182 1183 /* look for an earlier one which is the same */ 1184 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1185 if (cs2->hash == h && cs2 != cs) { 1186 /* maybe */ 1187 for (i = 0; i < css; i++) 1188 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1189 break; /* no */ 1190 if (i == css) 1191 break; /* yes */ 1192 } 1193 1194 if (cs2 < top) { /* found one */ 1195 freeset(p, cs); 1196 cs = cs2; 1197 } 1198 1199 return((int)(cs - p->g->sets)); 1200 } 1201 1202 /* 1203 - firstch - return first character in a set (which must have at least one) 1204 == static int firstch(register struct parse *p, register cset *cs); 1205 */ 1206 static int /* character; there is no "none" value */ 1207 firstch(p, cs) 1208 register struct parse *p; 1209 register cset *cs; 1210 { 1211 register int i; 1212 register size_t css = (size_t)p->g->csetsize; 1213 1214 for (i = 0; i < css; i++) 1215 if (CHIN(cs, i)) 1216 return((char)i); 1217 assert(never); 1218 return(0); /* arbitrary */ 1219 } 1220 1221 /* 1222 - nch - number of characters in a set 1223 == static int nch(register struct parse *p, register cset *cs); 1224 */ 1225 static int 1226 nch(p, cs) 1227 register struct parse *p; 1228 register cset *cs; 1229 { 1230 register int i; 1231 register size_t css = (size_t)p->g->csetsize; 1232 register int n = 0; 1233 1234 for (i = 0; i < css; i++) 1235 if (CHIN(cs, i)) 1236 n++; 1237 return(n); 1238 } 1239 1240 /* 1241 - mcadd - add a collating element to a cset 1242 == static void mcadd(register struct parse *p, register cset *cs, \ 1243 == register char *cp); 1244 */ 1245 static void 1246 mcadd(p, cs, cp) 1247 register struct parse *p; 1248 register cset *cs; 1249 register char *cp; 1250 { 1251 register size_t oldend = cs->smultis; 1252 1253 cs->smultis += strlen(cp) + 1; 1254 if (cs->multis == NULL) 1255 cs->multis = malloc(cs->smultis); 1256 else 1257 cs->multis = realloc(cs->multis, cs->smultis); 1258 if (cs->multis == NULL) { 1259 SETERROR(REG_ESPACE); 1260 return; 1261 } 1262 1263 (void) strcpy(cs->multis + oldend - 1, cp); 1264 cs->multis[cs->smultis - 1] = '\0'; 1265 } 1266 1267 /* 1268 - mcsub - subtract a collating element from a cset 1269 == static void mcsub(register cset *cs, register char *cp); 1270 */ 1271 static void 1272 mcsub(cs, cp) 1273 register cset *cs; 1274 register char *cp; 1275 { 1276 register char *fp = mcfind(cs, cp); 1277 register size_t len = strlen(fp); 1278 1279 assert(fp != NULL); 1280 (void) memmove(fp, fp + len + 1, 1281 cs->smultis - (fp + len + 1 - cs->multis)); 1282 cs->smultis -= len; 1283 1284 if (cs->smultis == 0) { 1285 free(cs->multis); 1286 cs->multis = NULL; 1287 return; 1288 } 1289 1290 cs->multis = realloc(cs->multis, cs->smultis); 1291 assert(cs->multis != NULL); 1292 } 1293 1294 /* 1295 - mcin - is a collating element in a cset? 1296 == static int mcin(register cset *cs, register char *cp); 1297 */ 1298 static int 1299 mcin(cs, cp) 1300 register cset *cs; 1301 register char *cp; 1302 { 1303 return(mcfind(cs, cp) != NULL); 1304 } 1305 1306 /* 1307 - mcfind - find a collating element in a cset 1308 == static char *mcfind(register cset *cs, register char *cp); 1309 */ 1310 static char * 1311 mcfind(cs, cp) 1312 register cset *cs; 1313 register char *cp; 1314 { 1315 register char *p; 1316 1317 if (cs->multis == NULL) 1318 return(NULL); 1319 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1320 if (strcmp(cp, p) == 0) 1321 return(p); 1322 return(NULL); 1323 } 1324 1325 /* 1326 - mcinvert - invert the list of collating elements in a cset 1327 == static void mcinvert(register struct parse *p, register cset *cs); 1328 * 1329 * This would have to know the set of possibilities. Implementation 1330 * is deferred. 1331 */ 1332 static void 1333 mcinvert(p, cs) 1334 register struct parse *p; 1335 register cset *cs; 1336 { 1337 assert(cs->multis == NULL); /* xxx */ 1338 } 1339 1340 /* 1341 - mccase - add case counterparts of the list of collating elements in a cset 1342 == static void mccase(register struct parse *p, register cset *cs); 1343 * 1344 * This would have to know the set of possibilities. Implementation 1345 * is deferred. 1346 */ 1347 static void 1348 mccase(p, cs) 1349 register struct parse *p; 1350 register cset *cs; 1351 { 1352 assert(cs->multis == NULL); /* xxx */ 1353 } 1354 1355 /* 1356 - isinsets - is this character in any sets? 1357 == static int isinsets(register struct re_guts *g, int c); 1358 */ 1359 static int /* predicate */ 1360 isinsets(g, c) 1361 register struct re_guts *g; 1362 int c; 1363 { 1364 register uch *col; 1365 register int i; 1366 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1367 register unsigned uc = (unsigned char)c; 1368 1369 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1370 if (col[uc] != 0) 1371 return(1); 1372 return(0); 1373 } 1374 1375 /* 1376 - samesets - are these two characters in exactly the same sets? 1377 == static int samesets(register struct re_guts *g, int c1, int c2); 1378 */ 1379 static int /* predicate */ 1380 samesets(g, c1, c2) 1381 register struct re_guts *g; 1382 int c1; 1383 int c2; 1384 { 1385 register uch *col; 1386 register int i; 1387 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1388 register unsigned uc1 = (unsigned char)c1; 1389 register unsigned uc2 = (unsigned char)c2; 1390 1391 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1392 if (col[uc1] != col[uc2]) 1393 return(0); 1394 return(1); 1395 } 1396 1397 /* 1398 - categorize - sort out character categories 1399 == static void categorize(struct parse *p, register struct re_guts *g); 1400 */ 1401 static void 1402 categorize(p, g) 1403 struct parse *p; 1404 register struct re_guts *g; 1405 { 1406 register cat_t *cats = g->categories; 1407 register int c; 1408 register int c2; 1409 register cat_t cat; 1410 1411 /* avoid making error situations worse */ 1412 if (p->error != 0) 1413 return; 1414 1415 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1416 if (cats[c] == 0 && isinsets(g, c)) { 1417 cat = g->ncategories++; 1418 cats[c] = cat; 1419 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1420 if (cats[c2] == 0 && samesets(g, c, c2)) 1421 cats[c2] = cat; 1422 } 1423 } 1424 1425 /* 1426 - dupl - emit a duplicate of a bunch of sops 1427 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1428 */ 1429 static sopno /* start of duplicate */ 1430 dupl(p, start, finish) 1431 register struct parse *p; 1432 sopno start; /* from here */ 1433 sopno finish; /* to this less one */ 1434 { 1435 register sopno ret = HERE(); 1436 register sopno len = finish - start; 1437 1438 assert(finish >= start); 1439 if (len == 0) 1440 return(ret); 1441 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1442 assert(p->ssize >= p->slen + len); 1443 (void) memcpy((char *)(p->strip + p->slen), 1444 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1445 p->slen += len; 1446 return(ret); 1447 } 1448 1449 /* 1450 - doemit - emit a strip operator 1451 == static void doemit(register struct parse *p, sop op, size_t opnd); 1452 * 1453 * It might seem better to implement this as a macro with a function as 1454 * hard-case backup, but it's just too big and messy unless there are 1455 * some changes to the data structures. Maybe later. 1456 */ 1457 static void 1458 doemit(p, op, opnd) 1459 register struct parse *p; 1460 sop op; 1461 size_t opnd; 1462 { 1463 /* avoid making error situations worse */ 1464 if (p->error != 0) 1465 return; 1466 1467 /* deal with oversize operands ("can't happen", more or less) */ 1468 assert(opnd < 1<<OPSHIFT); 1469 1470 /* deal with undersized strip */ 1471 if (p->slen >= p->ssize) 1472 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1473 assert(p->slen < p->ssize); 1474 1475 /* finally, it's all reduced to the easy case */ 1476 p->strip[p->slen++] = SOP(op, opnd); 1477 } 1478 1479 /* 1480 - doinsert - insert a sop into the strip 1481 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1482 */ 1483 static void 1484 doinsert(p, op, opnd, pos) 1485 register struct parse *p; 1486 sop op; 1487 size_t opnd; 1488 sopno pos; 1489 { 1490 register sopno sn; 1491 register sop s; 1492 register int i; 1493 1494 /* avoid making error situations worse */ 1495 if (p->error != 0) 1496 return; 1497 1498 sn = HERE(); 1499 EMIT(op, opnd); /* do checks, ensure space */ 1500 assert(HERE() == sn+1); 1501 s = p->strip[sn]; 1502 1503 /* adjust paren pointers */ 1504 assert(pos > 0); 1505 for (i = 1; i < NPAREN; i++) { 1506 if (p->pbegin[i] >= pos) { 1507 p->pbegin[i]++; 1508 } 1509 if (p->pend[i] >= pos) { 1510 p->pend[i]++; 1511 } 1512 } 1513 1514 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1515 (HERE()-pos-1)*sizeof(sop)); 1516 p->strip[pos] = s; 1517 } 1518 1519 /* 1520 - dofwd - complete a forward reference 1521 == static void dofwd(register struct parse *p, sopno pos, sop value); 1522 */ 1523 static void 1524 dofwd(p, pos, value) 1525 register struct parse *p; 1526 register sopno pos; 1527 sop value; 1528 { 1529 /* avoid making error situations worse */ 1530 if (p->error != 0) 1531 return; 1532 1533 assert(value < 1<<OPSHIFT); 1534 p->strip[pos] = OP(p->strip[pos]) | value; 1535 } 1536 1537 /* 1538 - enlarge - enlarge the strip 1539 == static void enlarge(register struct parse *p, sopno size); 1540 */ 1541 static void 1542 enlarge(p, size) 1543 register struct parse *p; 1544 register sopno size; 1545 { 1546 register sop *sp; 1547 1548 if (p->ssize >= size) 1549 return; 1550 1551 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1552 if (sp == NULL) { 1553 SETERROR(REG_ESPACE); 1554 return; 1555 } 1556 p->strip = sp; 1557 p->ssize = size; 1558 } 1559 1560 /* 1561 - stripsnug - compact the strip 1562 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1563 */ 1564 static void 1565 stripsnug(p, g) 1566 register struct parse *p; 1567 register struct re_guts *g; 1568 { 1569 g->nstates = p->slen; 1570 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1571 if (g->strip == NULL) { 1572 SETERROR(REG_ESPACE); 1573 g->strip = p->strip; 1574 } 1575 } 1576 1577 /* 1578 - findmust - fill in must and mlen with longest mandatory literal string 1579 == static void findmust(register struct parse *p, register struct re_guts *g); 1580 * 1581 * This algorithm could do fancy things like analyzing the operands of | 1582 * for common subsequences. Someday. This code is simple and finds most 1583 * of the interesting cases. 1584 * 1585 * Note that must and mlen got initialized during setup. 1586 */ 1587 static void 1588 findmust(p, g) 1589 struct parse *p; 1590 register struct re_guts *g; 1591 { 1592 register sop *scan; 1593 sop *start; 1594 register sop *newstart; 1595 register sopno newlen; 1596 register sop s; 1597 register char *cp; 1598 register sopno i; 1599 1600 /* avoid making error situations worse */ 1601 if (p->error != 0) 1602 return; 1603 1604 /* find the longest OCHAR sequence in strip */ 1605 newlen = 0; 1606 scan = g->strip + 1; 1607 do { 1608 s = *scan++; 1609 switch (OP(s)) { 1610 case OCHAR: /* sequence member */ 1611 if (newlen == 0) /* new sequence */ 1612 newstart = scan - 1; 1613 newlen++; 1614 break; 1615 case OPLUS_: /* things that don't break one */ 1616 case OLPAREN: 1617 case ORPAREN: 1618 break; 1619 case OQUEST_: /* things that must be skipped */ 1620 case OCH_: 1621 scan--; 1622 do { 1623 scan += OPND(s); 1624 s = *scan; 1625 /* assert() interferes w debug printouts */ 1626 if (OP(s) != O_QUEST && OP(s) != O_CH && 1627 OP(s) != OOR2) { 1628 g->iflags |= BAD; 1629 return; 1630 } 1631 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1632 /* fallthrough */ 1633 default: /* things that break a sequence */ 1634 if (newlen > g->mlen) { /* ends one */ 1635 start = newstart; 1636 g->mlen = newlen; 1637 } 1638 newlen = 0; 1639 break; 1640 } 1641 } while (OP(s) != OEND); 1642 1643 if (g->mlen == 0) /* there isn't one */ 1644 return; 1645 1646 /* turn it into a character string */ 1647 g->must = malloc((size_t)g->mlen + 1); 1648 if (g->must == NULL) { /* argh; just forget it */ 1649 g->mlen = 0; 1650 return; 1651 } 1652 cp = g->must; 1653 scan = start; 1654 for (i = g->mlen; i > 0; i--) { 1655 while (OP(s = *scan++) != OCHAR) 1656 continue; 1657 assert(cp < g->must + g->mlen); 1658 *cp++ = (char)OPND(s); 1659 } 1660 assert(cp == g->must + g->mlen); 1661 *cp++ = '\0'; /* just on general principles */ 1662 } 1663 1664 /* 1665 - pluscount - count + nesting 1666 == static sopno pluscount(register struct parse *p, register struct re_guts *g); 1667 */ 1668 static sopno /* nesting depth */ 1669 pluscount(p, g) 1670 struct parse *p; 1671 register struct re_guts *g; 1672 { 1673 register sop *scan; 1674 register sop s; 1675 register sopno plusnest = 0; 1676 register sopno maxnest = 0; 1677 1678 if (p->error != 0) 1679 return(0); /* there may not be an OEND */ 1680 1681 scan = g->strip + 1; 1682 do { 1683 s = *scan++; 1684 switch (OP(s)) { 1685 case OPLUS_: 1686 plusnest++; 1687 break; 1688 case O_PLUS: 1689 if (plusnest > maxnest) 1690 maxnest = plusnest; 1691 plusnest--; 1692 break; 1693 } 1694 } while (OP(s) != OEND); 1695 if (plusnest != 0) 1696 g->iflags |= BAD; 1697 return(maxnest); 1698 } 1699