1 /* $NetBSD: regcomp.c,v 1.7 2011/11/19 17:45:11 tnozaki Exp $ */ 2 3 /*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer of the University of Toronto. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)regcomp.c 8.4 (Berkeley) 3/19/94 36 */ 37 38 #if defined(LIBC_SCCS) && !defined(lint) 39 static char sccsid[] = "@(#)regcomp.c 8.4 (Berkeley) 3/19/94"; 40 #endif /* LIBC_SCCS and not lint */ 41 42 #include <sys/types.h> 43 #include <stdio.h> 44 #include <string.h> 45 #include <ctype.h> 46 #include <limits.h> 47 #include <stdlib.h> 48 #include <regex.h> 49 50 #include "utils.h" 51 #include "regex2.h" 52 53 #include "cclass.h" 54 #include "cname.h" 55 56 /* 57 * parse structure, passed up and down to avoid global variables and 58 * other clumsinesses 59 */ 60 struct parse { 61 RCHAR_T *next; /* next character in RE */ 62 RCHAR_T *end; /* end of string (-> NUL normally) */ 63 int error; /* has an error been seen? */ 64 sop *strip; /* malloced strip */ 65 RCHAR_T *stripdata; /* malloced stripdata */ 66 sopno ssize; /* malloced strip size (allocated) */ 67 sopno slen; /* malloced strip length (used) */ 68 int ncsalloc; /* number of csets allocated */ 69 struct re_guts *g; 70 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 71 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 72 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 73 }; 74 75 /* ========= begin header generated by ./mkh ========= */ 76 #ifdef __cplusplus 77 extern "C" { 78 #endif 79 80 /* === regcomp.c === */ 81 static void p_ere(struct parse *p, int stop, size_t reclimit); 82 static void p_ere_exp(struct parse *p, size_t reclimit); 83 static void p_str(struct parse *p); 84 static void p_bre(struct parse *p, int end1, int end2, size_t reclimit); 85 static int p_simp_re(struct parse *p, int starordinary, size_t reclimit); 86 static int p_count(struct parse *p); 87 static void p_bracket(struct parse *p); 88 static void p_b_term(struct parse *p, cset *cs); 89 static void p_b_cclass(struct parse *p, cset *cs); 90 static void p_b_eclass(struct parse *p, cset *cs); 91 static char p_b_symbol(struct parse *p); 92 static char p_b_coll_elem(struct parse *p, int endc); 93 static char othercase(int ch); 94 static void bothcases(struct parse *p, int ch); 95 static void ordinary(struct parse *p, int ch); 96 static void nonnewline(struct parse *p); 97 static void repeat(struct parse *p, sopno start, int from, int to, size_t reclimit); 98 static int seterr(struct parse *p, int e); 99 static cset *allocset(struct parse *p); 100 static void freeset(struct parse *p, cset *cs); 101 static int freezeset(struct parse *p, cset *cs); 102 static int firstch(struct parse *p, cset *cs); 103 static int nch(struct parse *p, cset *cs); 104 static void mcadd(struct parse *p, cset *cs, const char *cp); 105 #ifdef notdef 106 static void mcsub(cset *cs, char *cp); 107 static int mcin(cset *cs, char *cp); 108 static char *mcfind(cset *cs, char *cp); 109 #endif 110 static void mcinvert(struct parse *p, cset *cs); 111 static void mccase(struct parse *p, cset *cs); 112 #ifdef notdef 113 static int isinsets(struct re_guts *g, int c); 114 static int samesets(struct re_guts *g, int c1, int c2); 115 #endif 116 static void categorize(struct parse *p, struct re_guts *g); 117 static sopno dupl(struct parse *p, sopno start, sopno finish); 118 static void doemit(struct parse *p, sop op, size_t opnd); 119 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 120 static void dofwd(struct parse *p, sopno pos, sop value); 121 static int enlarge(struct parse *p, sopno size); 122 static void stripsnug(struct parse *p, struct re_guts *g); 123 static void findmust(struct parse *p, struct re_guts *g); 124 static sopno pluscount(struct parse *p, struct re_guts *g); 125 126 #ifdef __cplusplus 127 } 128 #endif 129 /* ========= end header generated by ./mkh ========= */ 130 131 static RCHAR_T nuls[10]; /* place to point scanner in event of error */ 132 133 /* 134 * macros for use with parse structure 135 * BEWARE: these know that the parse structure is named `p' !!! 136 */ 137 #define PEEK() (*p->next) 138 #define PEEK2() (*(p->next+1)) 139 #define MORE() (p->next < p->end) 140 #define MORE2() (p->next+1 < p->end) 141 #define SEE(c) (MORE() && PEEK() == (c)) 142 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 143 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 144 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 145 #define NEXT() (p->next++) 146 #define NEXT2() (p->next += 2) 147 #define NEXTn(n) (p->next += (n)) 148 #define GETNEXT() (*p->next++) 149 #define SETERROR(e) seterr(p, (e)) 150 #define REQUIRE(co, e) ((co) || SETERROR(e)) 151 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 152 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 153 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 154 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 155 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 156 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 157 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 158 #define HERE() (p->slen) 159 #define THERE() (p->slen - 1) 160 #define THERETHERE() (p->slen - 2) 161 #define DROP(n) (p->slen -= (n)) 162 163 #ifndef NDEBUG 164 static int never = 0; /* for use in asserts; shuts lint up */ 165 #else 166 #define never 0 /* some <assert.h>s have bugs too */ 167 #endif 168 169 #define MEMLIMIT 0x8000000 170 #define MEMSIZE(p) \ 171 ((p)->ncsalloc / CHAR_BIT * (p)->g->csetsize + \ 172 (p)->ncsalloc * sizeof(cset) + \ 173 (p)->ssize * sizeof(sop)) 174 #define RECLIMIT 256 175 176 /* 177 - regcomp - interface for parser and compilation 178 */ 179 int /* 0 success, otherwise REG_something */ 180 regcomp(regex_t *preg, const RCHAR_T *pattern, int cflags) 181 { 182 struct parse pa; 183 struct re_guts *g; 184 struct parse *p = &pa; 185 int i; 186 size_t len; 187 #ifdef REDEBUG 188 # define GOODFLAGS(f) (f) 189 #else 190 # define GOODFLAGS(f) ((f)&~REG_DUMP) 191 #endif 192 193 cflags = GOODFLAGS(cflags); 194 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 195 return(REG_INVARG); 196 197 if (cflags®_PEND) { 198 if (preg->re_endp < pattern) 199 return(REG_INVARG); 200 len = preg->re_endp - pattern; 201 } else 202 len = STRLEN(pattern); 203 204 /* do the mallocs early so failure handling is easy */ 205 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 206 (NC-1)*sizeof(cat_t)); 207 if (g == NULL) 208 return(REG_ESPACE); 209 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 210 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 211 if (p->strip == NULL) { 212 free((char *)g); 213 return(REG_ESPACE); 214 } 215 p->stripdata = (RCHAR_T *)malloc(p->ssize * sizeof(RCHAR_T)); 216 if (p->stripdata == NULL) { 217 free((char *)p->strip); 218 free((char *)g); 219 return(REG_ESPACE); 220 } 221 p->slen = 0; 222 223 /* set things up */ 224 p->g = g; 225 p->next = (RCHAR_T *)pattern; /* convenience; we do not modify it */ 226 p->end = p->next + len; 227 p->error = 0; 228 p->ncsalloc = 0; 229 for (i = 0; i < NPAREN; i++) { 230 p->pbegin[i] = 0; 231 p->pend[i] = 0; 232 } 233 g->csetsize = NC; 234 g->sets = NULL; 235 g->setbits = NULL; 236 g->ncsets = 0; 237 g->cflags = cflags; 238 g->iflags = 0; 239 g->nbol = 0; 240 g->neol = 0; 241 g->must = NULL; 242 g->mlen = 0; 243 g->nsub = 0; 244 #if 0 245 g->ncategories = 1; /* category 0 is "everything else" */ 246 g->categories = &g->catspace[-(CHAR_MIN)]; 247 memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 248 #endif 249 g->backrefs = 0; 250 251 /* do it */ 252 EMIT(OEND, 0); 253 g->firststate = THERE(); 254 if (cflags®_EXTENDED) 255 p_ere(p, OUT, 0); 256 else if (cflags®_NOSPEC) 257 p_str(p); 258 else 259 p_bre(p, OUT, OUT, 0); 260 EMIT(OEND, 0); 261 g->laststate = THERE(); 262 263 /* tidy up loose ends and fill things in */ 264 categorize(p, g); 265 stripsnug(p, g); 266 findmust(p, g); 267 g->nplus = pluscount(p, g); 268 g->magic = MAGIC2; 269 preg->re_nsub = g->nsub; 270 preg->re_g = g; 271 preg->re_magic = MAGIC1; 272 #ifndef REDEBUG 273 /* not debugging, so can't rely on the assert() in regexec() */ 274 if (g->iflags&BAD) 275 SETERROR(REG_ASSERT); 276 #endif 277 278 /* win or lose, we're done */ 279 if (p->error != 0) /* lose */ 280 regfree(preg); 281 return(p->error); 282 } 283 284 /* 285 - p_ere - ERE parser top level, concatenation and alternation 286 */ 287 static void 288 p_ere(struct parse *p, int stop, size_t reclimit) 289 /* character this ERE should end at */ 290 { 291 char c; 292 sopno prevback = 0; 293 sopno prevfwd = 0; 294 sopno conc; 295 int first = 1; /* is this the first alternative? */ 296 297 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 298 p->error = REG_ESPACE; 299 return; 300 } 301 302 for (;;) { 303 /* do a bunch of concatenated expressions */ 304 conc = HERE(); 305 while (MORE() && (c = PEEK()) != '|' && c != stop) 306 p_ere_exp(p, reclimit); 307 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 308 309 if (!EAT('|')) 310 break; /* NOTE BREAK OUT */ 311 312 if (first) { 313 INSERT(OCH_, conc); /* offset is wrong */ 314 prevfwd = conc; 315 prevback = conc; 316 first = 0; 317 } 318 ASTERN(OOR1, prevback); 319 prevback = THERE(); 320 AHEAD(prevfwd); /* fix previous offset */ 321 prevfwd = HERE(); 322 EMIT(OOR2, 0); /* offset is very wrong */ 323 } 324 325 if (!first) { /* tail-end fixups */ 326 AHEAD(prevfwd); 327 ASTERN(O_CH, prevback); 328 } 329 330 assert(!MORE() || SEE(stop)); 331 } 332 333 /* 334 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 335 */ 336 static void 337 p_ere_exp(struct parse *p, size_t reclimit) 338 { 339 char c; 340 sopno pos; 341 int count; 342 int count2; 343 sopno subno; 344 int wascaret = 0; 345 346 assert(MORE()); /* caller should have ensured this */ 347 c = GETNEXT(); 348 349 pos = HERE(); 350 switch (c) { 351 case '(': 352 (void)REQUIRE(MORE(), REG_EPAREN); 353 p->g->nsub++; 354 subno = p->g->nsub; 355 if (subno < NPAREN) 356 p->pbegin[subno] = HERE(); 357 EMIT(OLPAREN, subno); 358 if (!SEE(')')) 359 p_ere(p, ')', reclimit); 360 if (subno < NPAREN) { 361 p->pend[subno] = HERE(); 362 assert(p->pend[subno] != 0); 363 } 364 EMIT(ORPAREN, subno); 365 (void)MUSTEAT(')', REG_EPAREN); 366 break; 367 #ifndef POSIX_MISTAKE 368 case ')': /* happens only if no current unmatched ( */ 369 /* 370 * You may ask, why the ifndef? Because I didn't notice 371 * this until slightly too late for 1003.2, and none of the 372 * other 1003.2 regular-expression reviewers noticed it at 373 * all. So an unmatched ) is legal POSIX, at least until 374 * we can get it fixed. 375 */ 376 SETERROR(REG_EPAREN); 377 break; 378 #endif 379 case '^': 380 EMIT(OBOL, 0); 381 p->g->iflags |= USEBOL; 382 p->g->nbol++; 383 wascaret = 1; 384 break; 385 case '$': 386 EMIT(OEOL, 0); 387 p->g->iflags |= USEEOL; 388 p->g->neol++; 389 break; 390 case '|': 391 SETERROR(REG_EMPTY); 392 break; 393 case '*': 394 case '+': 395 case '?': 396 SETERROR(REG_BADRPT); 397 break; 398 case '.': 399 if (p->g->cflags®_NEWLINE) 400 nonnewline(p); 401 else 402 EMIT(OANY, 0); 403 break; 404 case '[': 405 p_bracket(p); 406 break; 407 case '\\': 408 (void)REQUIRE(MORE(), REG_EESCAPE); 409 c = GETNEXT(); 410 ordinary(p, c); 411 break; 412 case '{': /* okay as ordinary except if digit follows */ 413 (void)REQUIRE(!MORE() || !ISDIGIT((UCHAR_T)PEEK()), REG_BADRPT); 414 /* FALLTHROUGH */ 415 default: 416 ordinary(p, c); 417 break; 418 } 419 420 if (!MORE()) 421 return; 422 c = PEEK(); 423 /* we call { a repetition if followed by a digit */ 424 if (!( c == '*' || c == '+' || c == '?' || 425 (c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) )) 426 return; /* no repetition, we're done */ 427 NEXT(); 428 429 (void)REQUIRE(!wascaret, REG_BADRPT); 430 switch (c) { 431 case '*': /* implemented as +? */ 432 /* this case does not require the (y|) trick, noKLUDGE */ 433 INSERT(OPLUS_, pos); 434 ASTERN(O_PLUS, pos); 435 INSERT(OQUEST_, pos); 436 ASTERN(O_QUEST, pos); 437 break; 438 case '+': 439 INSERT(OPLUS_, pos); 440 ASTERN(O_PLUS, pos); 441 break; 442 case '?': 443 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 444 INSERT(OCH_, pos); /* offset slightly wrong */ 445 ASTERN(OOR1, pos); /* this one's right */ 446 AHEAD(pos); /* fix the OCH_ */ 447 EMIT(OOR2, 0); /* offset very wrong... */ 448 AHEAD(THERE()); /* ...so fix it */ 449 ASTERN(O_CH, THERETHERE()); 450 break; 451 case '{': 452 count = p_count(p); 453 if (EAT(',')) { 454 if (ISDIGIT((UCHAR_T)PEEK())) { 455 count2 = p_count(p); 456 (void)REQUIRE(count <= count2, REG_BADBR); 457 } else /* single number with comma */ 458 count2 = INFINITY; 459 } else /* just a single number */ 460 count2 = count; 461 repeat(p, pos, count, count2, 0); 462 if (!EAT('}')) { /* error heuristics */ 463 while (MORE() && PEEK() != '}') 464 NEXT(); 465 (void)REQUIRE(MORE(), REG_EBRACE); 466 SETERROR(REG_BADBR); 467 } 468 break; 469 } 470 471 if (!MORE()) 472 return; 473 c = PEEK(); 474 if (!( c == '*' || c == '+' || c == '?' || 475 (c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) ) ) 476 return; 477 SETERROR(REG_BADRPT); 478 } 479 480 /* 481 - p_str - string (no metacharacters) "parser" 482 */ 483 static void 484 p_str(struct parse *p) 485 { 486 (void)REQUIRE(MORE(), REG_EMPTY); 487 while (MORE()) 488 ordinary(p, GETNEXT()); 489 } 490 491 /* 492 - p_bre - BRE parser top level, anchoring and concatenation 493 * Giving end1 as OUT essentially eliminates the end1/end2 check. 494 * 495 * This implementation is a bit of a kludge, in that a trailing $ is first 496 * taken as an ordinary character and then revised to be an anchor. The 497 * only undesirable side effect is that '$' gets included as a character 498 * category in such cases. This is fairly harmless; not worth fixing. 499 * The amount of lookahead needed to avoid this kludge is excessive. 500 */ 501 static void 502 p_bre(struct parse *p, 503 int end1, /* first terminating character */ 504 int end2, /* second terminating character */ 505 size_t reclimit) 506 { 507 sopno start; 508 int first = 1; /* first subexpression? */ 509 int wasdollar = 0; 510 511 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 512 p->error = REG_ESPACE; 513 return; 514 } 515 516 start = HERE(); 517 518 if (EAT('^')) { 519 EMIT(OBOL, 0); 520 p->g->iflags |= USEBOL; 521 p->g->nbol++; 522 } 523 while (MORE() && !SEETWO(end1, end2)) { 524 wasdollar = p_simp_re(p, first, reclimit); 525 first = 0; 526 } 527 if (wasdollar) { /* oops, that was a trailing anchor */ 528 DROP(1); 529 EMIT(OEOL, 0); 530 p->g->iflags |= USEEOL; 531 p->g->neol++; 532 } 533 534 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 535 } 536 537 /* 538 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 539 */ 540 static int /* was the simple RE an unbackslashed $? */ 541 p_simp_re(struct parse *p, 542 int starordinary, /* is a leading * an ordinary character? */ 543 size_t reclimit) 544 { 545 int c; 546 int count; 547 int count2; 548 sopno pos; 549 int i; 550 sopno subno; 551 int backsl; 552 553 pos = HERE(); /* repetion op, if any, covers from here */ 554 555 assert(MORE()); /* caller should have ensured this */ 556 c = GETNEXT(); 557 backsl = c == '\\'; 558 if (backsl) { 559 (void)REQUIRE(MORE(), REG_EESCAPE); 560 c = (unsigned char)GETNEXT(); 561 switch (c) { 562 case '{': 563 SETERROR(REG_BADRPT); 564 break; 565 case '(': 566 p->g->nsub++; 567 subno = p->g->nsub; 568 if (subno < NPAREN) 569 p->pbegin[subno] = HERE(); 570 EMIT(OLPAREN, subno); 571 /* the MORE here is an error heuristic */ 572 if (MORE() && !SEETWO('\\', ')')) 573 p_bre(p, '\\', ')', reclimit); 574 if (subno < NPAREN) { 575 p->pend[subno] = HERE(); 576 assert(p->pend[subno] != 0); 577 } 578 EMIT(ORPAREN, subno); 579 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 580 break; 581 case ')': /* should not get here -- must be user */ 582 case '}': 583 SETERROR(REG_EPAREN); 584 break; 585 case '1': 586 case '2': 587 case '3': 588 case '4': 589 case '5': 590 case '6': 591 case '7': 592 case '8': 593 case '9': 594 i = c - '0'; 595 assert(i < NPAREN); 596 if (p->pend[i] != 0) { 597 assert(i <= p->g->nsub); 598 EMIT(OBACK_, i); 599 assert(p->pbegin[i] != 0); 600 assert(p->strip[p->pbegin[i]] == OLPAREN); 601 assert(p->strip[p->pend[i]] == ORPAREN); 602 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 603 EMIT(O_BACK, i); 604 } else 605 SETERROR(REG_ESUBREG); 606 p->g->backrefs = 1; 607 break; 608 default: 609 ordinary(p, c); 610 break; 611 } 612 } else { 613 switch (c) { 614 case '.': 615 if (p->g->cflags®_NEWLINE) 616 nonnewline(p); 617 else 618 EMIT(OANY, 0); 619 break; 620 case '[': 621 p_bracket(p); 622 break; 623 case '*': 624 (void)REQUIRE(starordinary, REG_BADRPT); 625 /* FALLTHROUGH */ 626 default: 627 ordinary(p, c); 628 break; 629 } 630 } 631 632 if (EAT('*')) { /* implemented as +? */ 633 /* this case does not require the (y|) trick, noKLUDGE */ 634 INSERT(OPLUS_, pos); 635 ASTERN(O_PLUS, pos); 636 INSERT(OQUEST_, pos); 637 ASTERN(O_QUEST, pos); 638 } else if (EATTWO('\\', '{')) { 639 count = p_count(p); 640 if (EAT(',')) { 641 if (MORE() && ISDIGIT((UCHAR_T)PEEK())) { 642 count2 = p_count(p); 643 (void)REQUIRE(count <= count2, REG_BADBR); 644 } else /* single number with comma */ 645 count2 = INFINITY; 646 } else /* just a single number */ 647 count2 = count; 648 repeat(p, pos, count, count2, reclimit); 649 if (!EATTWO('\\', '}')) { /* error heuristics */ 650 while (MORE() && !SEETWO('\\', '}')) 651 NEXT(); 652 (void)REQUIRE(MORE(), REG_EBRACE); 653 SETERROR(REG_BADBR); 654 } 655 } else if (!backsl && c == (unsigned char)'$') /* $ (but not \$) ends it */ 656 return(1); 657 658 return(0); 659 } 660 661 /* 662 - p_count - parse a repetition count 663 */ 664 static int /* the value */ 665 p_count(struct parse *p) 666 { 667 int count = 0; 668 int ndigits = 0; 669 670 while (MORE() && ISDIGIT((UCHAR_T)PEEK()) && count <= DUPMAX) { 671 count = count*10 + (GETNEXT() - '0'); 672 ndigits++; 673 } 674 675 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 676 return(count); 677 } 678 679 /* 680 - p_bracket - parse a bracketed character list 681 * 682 * Note a significant property of this code: if the allocset() did SETERROR, 683 * no set operations are done. 684 */ 685 static void 686 p_bracket(struct parse *p) 687 { 688 cset *cs; 689 int invert = 0; 690 static RCHAR_T bow[] = { '[', ':', '<', ':', ']', ']' }; 691 static RCHAR_T eow[] = { '[', ':', '>', ':', ']', ']' }; 692 693 cs = allocset(p); 694 if (cs == NULL) 695 return; 696 697 /* Dept of Truly Sickening Special-Case Kludges */ 698 if (p->next + 5 < p->end && MEMCMP(p->next, bow, 6) == 0) { 699 EMIT(OBOW, 0); 700 NEXTn(6); 701 return; 702 } 703 if (p->next + 5 < p->end && MEMCMP(p->next, eow, 6) == 0) { 704 EMIT(OEOW, 0); 705 NEXTn(6); 706 return; 707 } 708 709 if (EAT('^')) 710 invert++; /* make note to invert set at end */ 711 if (EAT(']')) 712 CHadd(cs, ']'); 713 else if (EAT('-')) 714 CHadd(cs, '-'); 715 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 716 p_b_term(p, cs); 717 if (EAT('-')) 718 CHadd(cs, '-'); 719 (void)MUSTEAT(']', REG_EBRACK); 720 721 if (p->error != 0) /* don't mess things up further */ 722 return; 723 724 if (p->g->cflags®_ICASE) { 725 int i; 726 int ci; 727 728 for (i = p->g->csetsize - 1; i >= 0; i--) 729 if (CHIN(cs, i) && isalpha(i)) { 730 ci = othercase(i); 731 if (ci != i) 732 CHadd(cs, ci); 733 } 734 if (cs->multis != NULL) 735 mccase(p, cs); 736 } 737 if (invert) { 738 int i; 739 740 for (i = p->g->csetsize - 1; i >= 0; i--) 741 if (CHIN(cs, i)) 742 CHsub(cs, i); 743 else 744 CHadd(cs, i); 745 if (p->g->cflags®_NEWLINE) 746 CHsub(cs, '\n'); 747 if (cs->multis != NULL) 748 mcinvert(p, cs); 749 } 750 751 assert(cs->multis == NULL); /* xxx */ 752 753 if (nch(p, cs) == 1) { /* optimize singleton sets */ 754 ordinary(p, firstch(p, cs)); 755 freeset(p, cs); 756 } else 757 EMIT(OANYOF, freezeset(p, cs)); 758 } 759 760 /* 761 - p_b_term - parse one term of a bracketed character list 762 */ 763 static void 764 p_b_term(struct parse *p, cset *cs) 765 { 766 char c; 767 char start, finish; 768 int i; 769 770 /* classify what we've got */ 771 switch ((MORE()) ? PEEK() : '\0') { 772 case '[': 773 c = (MORE2()) ? PEEK2() : '\0'; 774 break; 775 case '-': 776 SETERROR(REG_ERANGE); 777 return; /* NOTE RETURN */ 778 break; 779 default: 780 c = '\0'; 781 break; 782 } 783 784 switch (c) { 785 case ':': /* character class */ 786 NEXT2(); 787 (void)REQUIRE(MORE(), REG_EBRACK); 788 c = PEEK(); 789 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 790 p_b_cclass(p, cs); 791 (void)REQUIRE(MORE(), REG_EBRACK); 792 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 793 break; 794 case '=': /* equivalence class */ 795 NEXT2(); 796 (void)REQUIRE(MORE(), REG_EBRACK); 797 c = PEEK(); 798 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 799 p_b_eclass(p, cs); 800 (void)REQUIRE(MORE(), REG_EBRACK); 801 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 802 break; 803 default: /* symbol, ordinary character, or range */ 804 /* xxx revision needed for multichar stuff */ 805 start = p_b_symbol(p); 806 if (SEE('-') && MORE2() && PEEK2() != ']') { 807 /* range */ 808 NEXT(); 809 if (EAT('-')) 810 finish = '-'; 811 else 812 finish = p_b_symbol(p); 813 } else 814 finish = start; 815 /* xxx what about signed chars here... */ 816 (void)REQUIRE(start <= finish, REG_ERANGE); 817 for (i = start; i <= finish; i++) 818 CHadd(cs, i); 819 break; 820 } 821 } 822 823 /* 824 - p_b_cclass - parse a character-class name and deal with it 825 */ 826 static void 827 p_b_cclass(struct parse *p, cset *cs) 828 { 829 RCHAR_T *sp = p->next; 830 struct cclass *cp; 831 size_t len; 832 const char *u; 833 char c; 834 835 while (MORE() && isalpha(PEEK())) 836 NEXT(); 837 len = p->next - sp; 838 for (cp = cclasses; cp->name != NULL; cp++) 839 if (STRLEN(cp->name) == len && !MEMCMP(cp->name, sp, len)) 840 break; 841 if (cp->name == NULL) { 842 /* oops, didn't find it */ 843 SETERROR(REG_ECTYPE); 844 return; 845 } 846 847 u = cp->chars; 848 while ((c = *u++) != '\0') 849 CHadd(cs, c); 850 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 851 MCadd(p, cs, u); 852 } 853 854 /* 855 - p_b_eclass - parse an equivalence-class name and deal with it 856 * 857 * This implementation is incomplete. xxx 858 */ 859 static void 860 p_b_eclass(struct parse *p, cset *cs) 861 { 862 char c; 863 864 c = p_b_coll_elem(p, '='); 865 CHadd(cs, c); 866 } 867 868 /* 869 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 870 */ 871 static char /* value of symbol */ 872 p_b_symbol(struct parse *p) 873 { 874 char value; 875 876 (void)REQUIRE(MORE(), REG_EBRACK); 877 if (!EATTWO('[', '.')) 878 return(GETNEXT()); 879 880 /* collating symbol */ 881 value = p_b_coll_elem(p, '.'); 882 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 883 return(value); 884 } 885 886 /* 887 - p_b_coll_elem - parse a collating-element name and look it up 888 */ 889 static char /* value of collating element */ 890 p_b_coll_elem(struct parse *p, int endc) 891 892 /* name ended by endc,']' */ 893 { 894 RCHAR_T *sp = p->next; 895 struct cname *cp; 896 size_t len; 897 898 while (MORE() && !SEETWO(endc, ']')) 899 NEXT(); 900 if (!MORE()) { 901 SETERROR(REG_EBRACK); 902 return(0); 903 } 904 len = p->next - sp; 905 for (cp = cnames; cp->name != NULL; cp++) 906 if (STRLEN(cp->name) == len && MEMCMP(cp->name, sp, len)) 907 return(cp->code); /* known name */ 908 if (len == 1) 909 return(*sp); /* single character */ 910 SETERROR(REG_ECOLLATE); /* neither */ 911 return(0); 912 } 913 914 /* 915 - othercase - return the case counterpart of an alphabetic 916 */ 917 static char /* if no counterpart, return ch */ 918 othercase(int ch) 919 { 920 assert(isalpha(ch)); 921 if (isupper(ch)) 922 return(tolower(ch)); 923 else if (islower(ch)) 924 return(toupper(ch)); 925 else /* peculiar, but could happen */ 926 return(ch); 927 } 928 929 /* 930 - bothcases - emit a dualcase version of a two-case character 931 * 932 * Boy, is this implementation ever a kludge... 933 */ 934 static void 935 bothcases(struct parse *p, int ch) 936 { 937 RCHAR_T *oldnext = p->next; 938 RCHAR_T *oldend = p->end; 939 RCHAR_T bracket[3]; 940 941 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 942 p->next = bracket; 943 p->end = bracket+2; 944 bracket[0] = ch; 945 bracket[1] = ']'; 946 bracket[2] = '\0'; 947 p_bracket(p); 948 assert(p->next == bracket+2); 949 p->next = oldnext; 950 p->end = oldend; 951 } 952 953 /* 954 - ordinary - emit an ordinary character 955 */ 956 static void 957 ordinary(struct parse *p, int ch) 958 { 959 /* 960 cat_t *cap = p->g->categories; 961 */ 962 963 if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch) 964 bothcases(p, ch); 965 else { 966 EMIT(OCHAR, (UCHAR_T)ch); 967 /* 968 if (cap[ch] == 0) 969 cap[ch] = p->g->ncategories++; 970 */ 971 } 972 } 973 974 /* 975 - nonnewline - emit REG_NEWLINE version of OANY 976 * 977 * Boy, is this implementation ever a kludge... 978 */ 979 static void 980 nonnewline(struct parse *p) 981 { 982 RCHAR_T *oldnext = p->next; 983 RCHAR_T *oldend = p->end; 984 RCHAR_T bracket[4]; 985 986 p->next = bracket; 987 p->end = bracket+3; 988 bracket[0] = '^'; 989 bracket[1] = '\n'; 990 bracket[2] = ']'; 991 bracket[3] = '\0'; 992 p_bracket(p); 993 assert(p->next == bracket+3); 994 p->next = oldnext; 995 p->end = oldend; 996 } 997 998 /* 999 - repeat - generate code for a bounded repetition, recursively if needed 1000 */ 1001 static void 1002 repeat(struct parse *p, 1003 sopno start, /* operand from here to end of strip */ 1004 int from, /* repeated from this number */ 1005 int to, /* to this number of times (maybe INFINITY) */ 1006 size_t reclimit) 1007 { 1008 sopno finish; 1009 # define N 2 1010 # define INF 3 1011 # define REP(f, t) ((f)*8 + (t)) 1012 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1013 sopno copy; 1014 1015 if (reclimit++ > RECLIMIT) 1016 p->error = REG_ESPACE; 1017 if (p->error) 1018 return; 1019 1020 finish = HERE(); 1021 1022 assert(from <= to); 1023 1024 switch (REP(MAP(from), MAP(to))) { 1025 case REP(0, 0): /* must be user doing this */ 1026 DROP(finish-start); /* drop the operand */ 1027 break; 1028 case REP(0, 1): /* as x{1,1}? */ 1029 case REP(0, N): /* as x{1,n}? */ 1030 case REP(0, INF): /* as x{1,}? */ 1031 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1032 INSERT(OCH_, start); /* offset is wrong... */ 1033 repeat(p, start+1, 1, to, reclimit); 1034 ASTERN(OOR1, start); 1035 AHEAD(start); /* ... fix it */ 1036 EMIT(OOR2, 0); 1037 AHEAD(THERE()); 1038 ASTERN(O_CH, THERETHERE()); 1039 break; 1040 case REP(1, 1): /* trivial case */ 1041 /* done */ 1042 break; 1043 case REP(1, N): /* as x?x{1,n-1} */ 1044 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1045 INSERT(OCH_, start); 1046 ASTERN(OOR1, start); 1047 AHEAD(start); 1048 EMIT(OOR2, 0); /* offset very wrong... */ 1049 AHEAD(THERE()); /* ...so fix it */ 1050 ASTERN(O_CH, THERETHERE()); 1051 copy = dupl(p, start+1, finish+1); 1052 assert(copy == finish+4); 1053 repeat(p, copy, 1, to-1, reclimit); 1054 break; 1055 case REP(1, INF): /* as x+ */ 1056 INSERT(OPLUS_, start); 1057 ASTERN(O_PLUS, start); 1058 break; 1059 case REP(N, N): /* as xx{m-1,n-1} */ 1060 copy = dupl(p, start, finish); 1061 repeat(p, copy, from-1, to-1, reclimit); 1062 break; 1063 case REP(N, INF): /* as xx{n-1,INF} */ 1064 copy = dupl(p, start, finish); 1065 repeat(p, copy, from-1, to, reclimit); 1066 break; 1067 default: /* "can't happen" */ 1068 SETERROR(REG_ASSERT); /* just in case */ 1069 break; 1070 } 1071 } 1072 1073 /* 1074 - seterr - set an error condition 1075 */ 1076 static int /* useless but makes type checking happy */ 1077 seterr(struct parse *p, int e) 1078 { 1079 if (p->error == 0) /* keep earliest error condition */ 1080 p->error = e; 1081 p->next = nuls; /* try to bring things to a halt */ 1082 p->end = nuls; 1083 return(0); /* make the return value well-defined */ 1084 } 1085 1086 /* 1087 - allocset - allocate a set of characters for [] 1088 */ 1089 static cset * 1090 allocset(struct parse *p) 1091 { 1092 int no = p->g->ncsets++; 1093 size_t nc; 1094 size_t nbytes; 1095 cset *cs; 1096 size_t css = (size_t)p->g->csetsize; 1097 int i; 1098 1099 if (no >= p->ncsalloc) { /* need another column of space */ 1100 p->ncsalloc += CHAR_BIT; 1101 nc = p->ncsalloc; 1102 assert(nc % CHAR_BIT == 0); 1103 nbytes = nc / CHAR_BIT * css; 1104 if (MEMSIZE(p) > MEMLIMIT) 1105 goto oomem; 1106 if (p->g->sets == NULL) 1107 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1108 else 1109 p->g->sets = (cset *)realloc((char *)p->g->sets, 1110 nc * sizeof(cset)); 1111 if (p->g->setbits == NULL) 1112 p->g->setbits = (uch *)malloc(nbytes); 1113 else { 1114 p->g->setbits = (uch *)realloc((char *)p->g->setbits, 1115 nbytes); 1116 /* xxx this isn't right if setbits is now NULL */ 1117 for (i = 0; i < no; i++) 1118 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1119 } 1120 if (p->g->sets != NULL && p->g->setbits != NULL) 1121 memset((char *)p->g->setbits + (nbytes - css), 1122 0, css); 1123 else { 1124 oomem: 1125 no = 0; 1126 SETERROR(REG_ESPACE); 1127 /* caller's responsibility not to do set ops */ 1128 return NULL; 1129 } 1130 } 1131 1132 cs = &p->g->sets[no]; 1133 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1134 cs->mask = 1 << ((no) % CHAR_BIT); 1135 cs->hash = 0; 1136 cs->smultis = 0; 1137 cs->multis = NULL; 1138 1139 return(cs); 1140 } 1141 1142 /* 1143 - freeset - free a now-unused set 1144 */ 1145 static void 1146 freeset(struct parse *p, cset *cs) 1147 { 1148 size_t i; 1149 cset *top = &p->g->sets[p->g->ncsets]; 1150 size_t css = (size_t)p->g->csetsize; 1151 1152 for (i = 0; i < css; i++) 1153 CHsub(cs, i); 1154 if (cs == top-1) /* recover only the easy case */ 1155 p->g->ncsets--; 1156 } 1157 1158 /* 1159 - freezeset - final processing on a set of characters 1160 * 1161 * The main task here is merging identical sets. This is usually a waste 1162 * of time (although the hash code minimizes the overhead), but can win 1163 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1164 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1165 * the same value! 1166 */ 1167 static int /* set number */ 1168 freezeset(struct parse *p, cset *cs) 1169 { 1170 uch h = cs->hash; 1171 size_t i; 1172 cset *top = &p->g->sets[p->g->ncsets]; 1173 cset *cs2; 1174 size_t css = (size_t)p->g->csetsize; 1175 1176 /* look for an earlier one which is the same */ 1177 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1178 if (cs2->hash == h && cs2 != cs) { 1179 /* maybe */ 1180 for (i = 0; i < css; i++) 1181 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1182 break; /* no */ 1183 if (i == css) 1184 break; /* yes */ 1185 } 1186 1187 if (cs2 < top) { /* found one */ 1188 freeset(p, cs); 1189 cs = cs2; 1190 } 1191 1192 return((int)(cs - p->g->sets)); 1193 } 1194 1195 /* 1196 - firstch - return first character in a set (which must have at least one) 1197 */ 1198 static int /* character; there is no "none" value */ 1199 firstch(struct parse *p, cset *cs) 1200 { 1201 size_t i; 1202 size_t css = (size_t)p->g->csetsize; 1203 1204 for (i = 0; i < css; i++) 1205 if (CHIN(cs, i)) 1206 return((char)i); 1207 assert(never); 1208 return(0); /* arbitrary */ 1209 } 1210 1211 /* 1212 - nch - number of characters in a set 1213 */ 1214 static int 1215 nch(struct parse *p, cset *cs) 1216 { 1217 size_t i; 1218 size_t css = (size_t)p->g->csetsize; 1219 int n = 0; 1220 1221 for (i = 0; i < css; i++) 1222 if (CHIN(cs, i)) 1223 n++; 1224 return(n); 1225 } 1226 1227 /* 1228 - mcadd - add a collating element to a cset 1229 */ 1230 static void 1231 mcadd(struct parse *p, cset *cs, const char *cp) 1232 { 1233 size_t oldend = cs->smultis; 1234 void *np; 1235 1236 cs->smultis += strlen(cp) + 1; 1237 np = realloc(cs->multis, cs->smultis); 1238 if (np == NULL) { 1239 if (cs->multis) 1240 free(cs->multis); 1241 cs->multis = NULL; 1242 SETERROR(REG_ESPACE); 1243 return; 1244 } 1245 cs->multis = np; 1246 1247 strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1); 1248 } 1249 1250 /* 1251 - mcinvert - invert the list of collating elements in a cset 1252 * 1253 * This would have to know the set of possibilities. Implementation 1254 * is deferred. 1255 */ 1256 static void 1257 mcinvert(struct parse *p, cset *cs) 1258 { 1259 assert(cs->multis == NULL); /* xxx */ 1260 } 1261 1262 /* 1263 - mccase - add case counterparts of the list of collating elements in a cset 1264 * 1265 * This would have to know the set of possibilities. Implementation 1266 * is deferred. 1267 */ 1268 static void 1269 mccase(struct parse *p, cset *cs) 1270 { 1271 assert(cs->multis == NULL); /* xxx */ 1272 } 1273 1274 #ifdef notdef 1275 /* 1276 - isinsets - is this character in any sets? 1277 */ 1278 static int /* predicate */ 1279 isinsets(struct re_guts *g, int c) 1280 { 1281 uch *col; 1282 int i; 1283 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1284 unsigned uc = (unsigned char)c; 1285 1286 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1287 if (col[uc] != 0) 1288 return(1); 1289 return(0); 1290 } 1291 1292 /* 1293 - samesets - are these two characters in exactly the same sets? 1294 */ 1295 static int /* predicate */ 1296 samesets(struct re_guts *g, int c1, int c2) 1297 { 1298 uch *col; 1299 int i; 1300 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1301 unsigned uc1 = (unsigned char)c1; 1302 unsigned uc2 = (unsigned char)c2; 1303 1304 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1305 if (col[uc1] != col[uc2]) 1306 return(0); 1307 return(1); 1308 } 1309 #endif 1310 1311 /* 1312 - categorize - sort out character categories 1313 */ 1314 static void 1315 categorize(struct parse *p, struct re_guts *g) 1316 { 1317 #ifdef notdef 1318 cat_t *cats = g->categories; 1319 int c; 1320 int c2; 1321 cat_t cat; 1322 1323 /* avoid making error situations worse */ 1324 if (p->error != 0) 1325 return; 1326 1327 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1328 if (cats[c] == 0 && isinsets(g, c)) { 1329 cat = g->ncategories++; 1330 cats[c] = cat; 1331 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1332 if (cats[c2] == 0 && samesets(g, c, c2)) 1333 cats[c2] = cat; 1334 } 1335 #endif 1336 } 1337 1338 /* 1339 - dupl - emit a duplicate of a bunch of sops 1340 */ 1341 static sopno /* start of duplicate */ 1342 dupl(struct parse *p, 1343 sopno start, /* from here */ 1344 sopno finish) /* to this less one */ 1345 { 1346 sopno ret = HERE(); 1347 sopno len = finish - start; 1348 1349 assert(finish >= start); 1350 if (len == 0) 1351 return(ret); 1352 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1353 return ret; 1354 assert(p->ssize >= p->slen + len); 1355 (void) memcpy((char *)(p->strip + p->slen), 1356 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1357 (void) memcpy((char *)(p->stripdata + p->slen), 1358 (char *)(p->stripdata + start), (size_t)len*sizeof(RCHAR_T)); 1359 p->slen += len; 1360 return(ret); 1361 } 1362 1363 /* 1364 - doemit - emit a strip operator 1365 * 1366 * It might seem better to implement this as a macro with a function as 1367 * hard-case backup, but it's just too big and messy unless there are 1368 * some changes to the data structures. Maybe later. 1369 */ 1370 static void 1371 doemit(struct parse *p, sop op, size_t opnd) 1372 { 1373 /* avoid making error situations worse */ 1374 if (p->error != 0) 1375 return; 1376 1377 /* deal with oversize operands ("can't happen", more or less) */ 1378 assert(opnd < 1); 1379 1380 /* deal with undersized strip */ 1381 if (p->slen >= p->ssize) 1382 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1383 return; 1384 1385 /* finally, it's all reduced to the easy case */ 1386 p->strip[p->slen] = op; 1387 p->stripdata[p->slen] = opnd; 1388 p->slen++; 1389 } 1390 1391 /* 1392 - doinsert - insert a sop into the strip 1393 */ 1394 static void 1395 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1396 { 1397 sopno sn; 1398 sop s; 1399 RCHAR_T d; 1400 int i; 1401 1402 /* avoid making error situations worse */ 1403 if (p->error != 0) 1404 return; 1405 1406 sn = HERE(); 1407 EMIT(op, opnd); /* do checks, ensure space */ 1408 assert(HERE() == sn+1); 1409 s = p->strip[sn]; 1410 d = p->stripdata[sn]; 1411 1412 /* adjust paren pointers */ 1413 assert(pos > 0); 1414 for (i = 1; i < NPAREN; i++) { 1415 if (p->pbegin[i] >= pos) { 1416 p->pbegin[i]++; 1417 } 1418 if (p->pend[i] >= pos) { 1419 p->pend[i]++; 1420 } 1421 } 1422 1423 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1424 (HERE()-pos-1)*sizeof(sop)); 1425 memmove((char *)&p->stripdata[pos+1], (char *)&p->stripdata[pos], 1426 (HERE()-pos-1)*sizeof(RCHAR_T)); 1427 p->strip[pos] = s; 1428 p->stripdata[pos] = d; 1429 } 1430 1431 /* 1432 - dofwd - complete a forward reference 1433 */ 1434 static void 1435 dofwd(struct parse *p, sopno pos, sop value) 1436 { 1437 /* avoid making error situations worse */ 1438 if (p->error != 0) 1439 return; 1440 1441 assert(value < 1); 1442 p->stripdata[pos] = value; 1443 } 1444 1445 /* 1446 - enlarge - enlarge the strip 1447 */ 1448 static int 1449 enlarge(struct parse *p, sopno size) 1450 { 1451 sop *sp; 1452 RCHAR_T *dp; 1453 sopno osize; 1454 1455 if (p->ssize >= size) 1456 return 1; 1457 1458 osize = p->ssize; 1459 p->ssize = size; 1460 if (MEMSIZE(p) > MEMLIMIT) 1461 goto oomem; 1462 sp = realloc(p->strip, p->ssize * sizeof(sop)); 1463 if (sp == NULL) 1464 goto oomem; 1465 p->strip = sp; 1466 dp = realloc(p->stripdata, p->ssize * sizeof(RCHAR_T)); 1467 if (dp == NULL) { 1468 oomem: 1469 p->ssize = osize; 1470 SETERROR(REG_ESPACE); 1471 return 0; 1472 } 1473 p->stripdata = dp; 1474 return 1; 1475 } 1476 1477 /* 1478 - stripsnug - compact the strip 1479 */ 1480 static void 1481 stripsnug(struct parse *p, struct re_guts *g) 1482 { 1483 g->nstates = p->slen; 1484 g->strip = (sop *)realloc((char *)p->strip, 1485 p->slen * sizeof(sop)); 1486 if (g->strip == NULL) { 1487 SETERROR(REG_ESPACE); 1488 g->strip = p->strip; 1489 } 1490 g->stripdata = (RCHAR_T *)realloc((char *)p->stripdata, 1491 p->slen * sizeof(RCHAR_T)); 1492 if (g->stripdata == NULL) { 1493 SETERROR(REG_ESPACE); 1494 g->stripdata = p->stripdata; 1495 } 1496 } 1497 1498 /* 1499 - findmust - fill in must and mlen with longest mandatory literal string 1500 * 1501 * This algorithm could do fancy things like analyzing the operands of | 1502 * for common subsequences. Someday. This code is simple and finds most 1503 * of the interesting cases. 1504 * 1505 * Note that must and mlen got initialized during setup. 1506 */ 1507 static void 1508 findmust(struct parse *p, struct re_guts *g) 1509 { 1510 sop *scans; 1511 RCHAR_T *scand; 1512 sop *starts = 0; 1513 RCHAR_T *startd = NULL; 1514 sop *newstarts = 0; 1515 RCHAR_T *newstartd = NULL; 1516 sopno newlen; 1517 sop s; 1518 RCHAR_T d; 1519 RCHAR_T *cp; 1520 sopno i; 1521 1522 /* avoid making error situations worse */ 1523 if (p->error != 0) 1524 return; 1525 1526 /* find the longest OCHAR sequence in strip */ 1527 newlen = 0; 1528 scans = g->strip + 1; 1529 scand = g->stripdata + 1; 1530 do { 1531 s = *scans++; 1532 d = *scand++; 1533 switch (s) { 1534 case OCHAR: /* sequence member */ 1535 if (newlen == 0) { /* new sequence */ 1536 newstarts = scans - 1; 1537 newstartd = scand - 1; 1538 } 1539 newlen++; 1540 break; 1541 case OPLUS_: /* things that don't break one */ 1542 case OLPAREN: 1543 case ORPAREN: 1544 break; 1545 case OQUEST_: /* things that must be skipped */ 1546 case OCH_: 1547 scans--; 1548 scand--; 1549 do { 1550 scans += d; 1551 scand += d; 1552 s = *scans; 1553 d = *scand; 1554 /* assert() interferes w debug printouts */ 1555 if (s != O_QUEST && s != O_CH && s != OOR2) { 1556 g->iflags |= BAD; 1557 return; 1558 } 1559 } while (s != O_QUEST && s != O_CH); 1560 /* fallthrough */ 1561 default: /* things that break a sequence */ 1562 if (newlen > g->mlen) { /* ends one */ 1563 starts = newstarts; 1564 startd = newstartd; 1565 g->mlen = newlen; 1566 } 1567 newlen = 0; 1568 break; 1569 } 1570 } while (s != OEND); 1571 1572 if (g->mlen == 0) /* there isn't one */ 1573 return; 1574 1575 /* turn it into a character string */ 1576 g->must = malloc(((size_t)g->mlen + 1) * sizeof(RCHAR_T)); 1577 if (g->must == NULL) { /* argh; just forget it */ 1578 g->mlen = 0; 1579 return; 1580 } 1581 cp = g->must; 1582 scans = starts; 1583 scand = startd; 1584 for (i = g->mlen; i > 0; i--) { 1585 for (;;) { 1586 s = *scans++; 1587 d = *scand++; 1588 if (s == OCHAR) 1589 break; 1590 } 1591 assert(cp < g->must + g->mlen); 1592 *cp++ = d; 1593 } 1594 assert(cp == g->must + g->mlen); 1595 *cp++ = '\0'; /* just on general principles */ 1596 } 1597 1598 /* 1599 - pluscount - count + nesting 1600 */ 1601 static sopno /* nesting depth */ 1602 pluscount(struct parse *p, struct re_guts *g) 1603 { 1604 sop *scan; 1605 sop s; 1606 sopno plusnest = 0; 1607 sopno maxnest = 0; 1608 1609 if (p->error != 0) 1610 return(0); /* there may not be an OEND */ 1611 1612 scan = g->strip + 1; 1613 do { 1614 s = *scan++; 1615 switch (s) { 1616 case OPLUS_: 1617 plusnest++; 1618 break; 1619 case O_PLUS: 1620 if (plusnest > maxnest) 1621 maxnest = plusnest; 1622 plusnest--; 1623 break; 1624 } 1625 } while (s != OEND); 1626 if (plusnest != 0) 1627 g->iflags |= BAD; 1628 return(maxnest); 1629 } 1630