1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 34 */ 35 36 #if defined(LIBC_SCCS) && !defined(lint) 37 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 38 #endif /* LIBC_SCCS and not lint */ 39 #include <sys/cdefs.h> 40 __FBSDID("$FreeBSD$"); 41 42 #include <sys/types.h> 43 #include <stdio.h> 44 #include <string.h> 45 #include <ctype.h> 46 #include <limits.h> 47 #include <stdlib.h> 48 #include <regex.h> 49 #include <runetype.h> 50 #include <wchar.h> 51 #include <wctype.h> 52 53 #include "collate.h" 54 55 #include "utils.h" 56 #include "regex2.h" 57 58 #include "cname.h" 59 60 /* 61 * parse structure, passed up and down to avoid global variables and 62 * other clumsinesses 63 */ 64 struct parse { 65 char *next; /* next character in RE */ 66 char *end; /* end of string (-> NUL normally) */ 67 int error; /* has an error been seen? */ 68 sop *strip; /* malloced strip */ 69 sopno ssize; /* malloced strip size (allocated) */ 70 sopno slen; /* malloced strip length (used) */ 71 int ncsalloc; /* number of csets allocated */ 72 struct re_guts *g; 73 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 74 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 75 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 76 }; 77 78 /* ========= begin header generated by ./mkh ========= */ 79 #ifdef __cplusplus 80 extern "C" { 81 #endif 82 83 /* === regcomp.c === */ 84 static void p_ere(struct parse *p, wint_t stop); 85 static void p_ere_exp(struct parse *p); 86 static void p_str(struct parse *p); 87 static void p_bre(struct parse *p, wint_t end1, wint_t end2); 88 static int p_simp_re(struct parse *p, int starordinary); 89 static int p_count(struct parse *p); 90 static void p_bracket(struct parse *p); 91 static void p_b_term(struct parse *p, cset *cs); 92 static void p_b_cclass(struct parse *p, cset *cs); 93 static void p_b_eclass(struct parse *p, cset *cs); 94 static wint_t p_b_symbol(struct parse *p); 95 static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 96 static wint_t othercase(wint_t ch); 97 static void bothcases(struct parse *p, wint_t ch); 98 static void ordinary(struct parse *p, wint_t ch); 99 static void nonnewline(struct parse *p); 100 static void repeat(struct parse *p, sopno start, int from, int to); 101 static int seterr(struct parse *p, int e); 102 static cset *allocset(struct parse *p); 103 static void freeset(struct parse *p, cset *cs); 104 static void CHadd(struct parse *p, cset *cs, wint_t ch); 105 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 106 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 107 static wint_t singleton(cset *cs); 108 static sopno dupl(struct parse *p, sopno start, sopno finish); 109 static void doemit(struct parse *p, sop op, size_t opnd); 110 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 111 static void dofwd(struct parse *p, sopno pos, sop value); 112 static void enlarge(struct parse *p, sopno size); 113 static void stripsnug(struct parse *p, struct re_guts *g); 114 static void findmust(struct parse *p, struct re_guts *g); 115 static int altoffset(sop *scan, int offset); 116 static void computejumps(struct parse *p, struct re_guts *g); 117 static void computematchjumps(struct parse *p, struct re_guts *g); 118 static sopno pluscount(struct parse *p, struct re_guts *g); 119 static wint_t wgetnext(struct parse *p); 120 121 #ifdef __cplusplus 122 } 123 #endif 124 /* ========= end header generated by ./mkh ========= */ 125 126 static char nuls[10]; /* place to point scanner in event of error */ 127 128 /* 129 * macros for use with parse structure 130 * BEWARE: these know that the parse structure is named `p' !!! 131 */ 132 #define PEEK() (*p->next) 133 #define PEEK2() (*(p->next+1)) 134 #define MORE() (p->next < p->end) 135 #define MORE2() (p->next+1 < p->end) 136 #define SEE(c) (MORE() && PEEK() == (c)) 137 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 138 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 139 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 140 #define NEXT() (p->next++) 141 #define NEXT2() (p->next += 2) 142 #define NEXTn(n) (p->next += (n)) 143 #define GETNEXT() (*p->next++) 144 #define WGETNEXT() wgetnext(p) 145 #define SETERROR(e) seterr(p, (e)) 146 #define REQUIRE(co, e) ((co) || SETERROR(e)) 147 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 148 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 149 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 150 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 151 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 152 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 153 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 154 #define HERE() (p->slen) 155 #define THERE() (p->slen - 1) 156 #define THERETHERE() (p->slen - 2) 157 #define DROP(n) (p->slen -= (n)) 158 159 #ifndef NDEBUG 160 static int never = 0; /* for use in asserts; shuts lint up */ 161 #else 162 #define never 0 /* some <assert.h>s have bugs too */ 163 #endif 164 165 /* Macro used by computejump()/computematchjump() */ 166 #define MIN(a,b) ((a)<(b)?(a):(b)) 167 168 /* 169 - regcomp - interface for parser and compilation 170 = extern int regcomp(regex_t *, const char *, int); 171 = #define REG_BASIC 0000 172 = #define REG_EXTENDED 0001 173 = #define REG_ICASE 0002 174 = #define REG_NOSUB 0004 175 = #define REG_NEWLINE 0010 176 = #define REG_NOSPEC 0020 177 = #define REG_PEND 0040 178 = #define REG_DUMP 0200 179 */ 180 int /* 0 success, otherwise REG_something */ 181 regcomp(preg, pattern, cflags) 182 regex_t * __restrict preg; 183 const char * __restrict pattern; 184 int cflags; 185 { 186 struct parse pa; 187 struct re_guts *g; 188 struct parse *p = &pa; 189 int i; 190 size_t len; 191 #ifdef REDEBUG 192 # define GOODFLAGS(f) (f) 193 #else 194 # define GOODFLAGS(f) ((f)&~REG_DUMP) 195 #endif 196 197 cflags = GOODFLAGS(cflags); 198 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 199 return(REG_INVARG); 200 201 if (cflags®_PEND) { 202 if (preg->re_endp < pattern) 203 return(REG_INVARG); 204 len = preg->re_endp - pattern; 205 } else 206 len = strlen((char *)pattern); 207 208 /* do the mallocs early so failure handling is easy */ 209 g = (struct re_guts *)malloc(sizeof(struct re_guts)); 210 if (g == NULL) 211 return(REG_ESPACE); 212 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 213 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 214 p->slen = 0; 215 if (p->strip == NULL) { 216 free((char *)g); 217 return(REG_ESPACE); 218 } 219 220 /* set things up */ 221 p->g = g; 222 p->next = (char *)pattern; /* convenience; we do not modify it */ 223 p->end = p->next + len; 224 p->error = 0; 225 p->ncsalloc = 0; 226 for (i = 0; i < NPAREN; i++) { 227 p->pbegin[i] = 0; 228 p->pend[i] = 0; 229 } 230 g->sets = NULL; 231 g->ncsets = 0; 232 g->cflags = cflags; 233 g->iflags = 0; 234 g->nbol = 0; 235 g->neol = 0; 236 g->must = NULL; 237 g->moffset = -1; 238 g->charjump = NULL; 239 g->matchjump = NULL; 240 g->mlen = 0; 241 g->nsub = 0; 242 g->backrefs = 0; 243 244 /* do it */ 245 EMIT(OEND, 0); 246 g->firststate = THERE(); 247 if (cflags®_EXTENDED) 248 p_ere(p, OUT); 249 else if (cflags®_NOSPEC) 250 p_str(p); 251 else 252 p_bre(p, OUT, OUT); 253 EMIT(OEND, 0); 254 g->laststate = THERE(); 255 256 /* tidy up loose ends and fill things in */ 257 stripsnug(p, g); 258 findmust(p, g); 259 /* only use Boyer-Moore algorithm if the pattern is bigger 260 * than three characters 261 */ 262 if(g->mlen > 3) { 263 computejumps(p, g); 264 computematchjumps(p, g); 265 if(g->matchjump == NULL && g->charjump != NULL) { 266 free(g->charjump); 267 g->charjump = NULL; 268 } 269 } 270 g->nplus = pluscount(p, g); 271 g->magic = MAGIC2; 272 preg->re_nsub = g->nsub; 273 preg->re_g = g; 274 preg->re_magic = MAGIC1; 275 #ifndef REDEBUG 276 /* not debugging, so can't rely on the assert() in regexec() */ 277 if (g->iflags&BAD) 278 SETERROR(REG_ASSERT); 279 #endif 280 281 /* win or lose, we're done */ 282 if (p->error != 0) /* lose */ 283 regfree(preg); 284 return(p->error); 285 } 286 287 /* 288 - p_ere - ERE parser top level, concatenation and alternation 289 == static void p_ere(struct parse *p, int stop); 290 */ 291 static void 292 p_ere(p, stop) 293 struct parse *p; 294 int stop; /* character this ERE should end at */ 295 { 296 char c; 297 sopno prevback; 298 sopno prevfwd; 299 sopno conc; 300 int first = 1; /* is this the first alternative? */ 301 302 for (;;) { 303 /* do a bunch of concatenated expressions */ 304 conc = HERE(); 305 while (MORE() && (c = PEEK()) != '|' && c != stop) 306 p_ere_exp(p); 307 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 308 309 if (!EAT('|')) 310 break; /* NOTE BREAK OUT */ 311 312 if (first) { 313 INSERT(OCH_, conc); /* offset is wrong */ 314 prevfwd = conc; 315 prevback = conc; 316 first = 0; 317 } 318 ASTERN(OOR1, prevback); 319 prevback = THERE(); 320 AHEAD(prevfwd); /* fix previous offset */ 321 prevfwd = HERE(); 322 EMIT(OOR2, 0); /* offset is very wrong */ 323 } 324 325 if (!first) { /* tail-end fixups */ 326 AHEAD(prevfwd); 327 ASTERN(O_CH, prevback); 328 } 329 330 assert(!MORE() || SEE(stop)); 331 } 332 333 /* 334 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 335 == static void p_ere_exp(struct parse *p); 336 */ 337 static void 338 p_ere_exp(p) 339 struct parse *p; 340 { 341 char c; 342 wint_t wc; 343 sopno pos; 344 int count; 345 int count2; 346 sopno subno; 347 int wascaret = 0; 348 349 assert(MORE()); /* caller should have ensured this */ 350 c = GETNEXT(); 351 352 pos = HERE(); 353 switch (c) { 354 case '(': 355 (void)REQUIRE(MORE(), REG_EPAREN); 356 p->g->nsub++; 357 subno = p->g->nsub; 358 if (subno < NPAREN) 359 p->pbegin[subno] = HERE(); 360 EMIT(OLPAREN, subno); 361 if (!SEE(')')) 362 p_ere(p, ')'); 363 if (subno < NPAREN) { 364 p->pend[subno] = HERE(); 365 assert(p->pend[subno] != 0); 366 } 367 EMIT(ORPAREN, subno); 368 (void)MUSTEAT(')', REG_EPAREN); 369 break; 370 #ifndef POSIX_MISTAKE 371 case ')': /* happens only if no current unmatched ( */ 372 /* 373 * You may ask, why the ifndef? Because I didn't notice 374 * this until slightly too late for 1003.2, and none of the 375 * other 1003.2 regular-expression reviewers noticed it at 376 * all. So an unmatched ) is legal POSIX, at least until 377 * we can get it fixed. 378 */ 379 SETERROR(REG_EPAREN); 380 break; 381 #endif 382 case '^': 383 EMIT(OBOL, 0); 384 p->g->iflags |= USEBOL; 385 p->g->nbol++; 386 wascaret = 1; 387 break; 388 case '$': 389 EMIT(OEOL, 0); 390 p->g->iflags |= USEEOL; 391 p->g->neol++; 392 break; 393 case '|': 394 SETERROR(REG_EMPTY); 395 break; 396 case '*': 397 case '+': 398 case '?': 399 SETERROR(REG_BADRPT); 400 break; 401 case '.': 402 if (p->g->cflags®_NEWLINE) 403 nonnewline(p); 404 else 405 EMIT(OANY, 0); 406 break; 407 case '[': 408 p_bracket(p); 409 break; 410 case '\\': 411 (void)REQUIRE(MORE(), REG_EESCAPE); 412 wc = WGETNEXT(); 413 ordinary(p, wc); 414 break; 415 case '{': /* okay as ordinary except if digit follows */ 416 (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 417 /* FALLTHROUGH */ 418 default: 419 p->next--; 420 wc = WGETNEXT(); 421 ordinary(p, wc); 422 break; 423 } 424 425 if (!MORE()) 426 return; 427 c = PEEK(); 428 /* we call { a repetition if followed by a digit */ 429 if (!( c == '*' || c == '+' || c == '?' || 430 (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) 431 return; /* no repetition, we're done */ 432 NEXT(); 433 434 (void)REQUIRE(!wascaret, REG_BADRPT); 435 switch (c) { 436 case '*': /* implemented as +? */ 437 /* this case does not require the (y|) trick, noKLUDGE */ 438 INSERT(OPLUS_, pos); 439 ASTERN(O_PLUS, pos); 440 INSERT(OQUEST_, pos); 441 ASTERN(O_QUEST, pos); 442 break; 443 case '+': 444 INSERT(OPLUS_, pos); 445 ASTERN(O_PLUS, pos); 446 break; 447 case '?': 448 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 449 INSERT(OCH_, pos); /* offset slightly wrong */ 450 ASTERN(OOR1, pos); /* this one's right */ 451 AHEAD(pos); /* fix the OCH_ */ 452 EMIT(OOR2, 0); /* offset very wrong... */ 453 AHEAD(THERE()); /* ...so fix it */ 454 ASTERN(O_CH, THERETHERE()); 455 break; 456 case '{': 457 count = p_count(p); 458 if (EAT(',')) { 459 if (isdigit((uch)PEEK())) { 460 count2 = p_count(p); 461 (void)REQUIRE(count <= count2, REG_BADBR); 462 } else /* single number with comma */ 463 count2 = INFINITY; 464 } else /* just a single number */ 465 count2 = count; 466 repeat(p, pos, count, count2); 467 if (!EAT('}')) { /* error heuristics */ 468 while (MORE() && PEEK() != '}') 469 NEXT(); 470 (void)REQUIRE(MORE(), REG_EBRACE); 471 SETERROR(REG_BADBR); 472 } 473 break; 474 } 475 476 if (!MORE()) 477 return; 478 c = PEEK(); 479 if (!( c == '*' || c == '+' || c == '?' || 480 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 481 return; 482 SETERROR(REG_BADRPT); 483 } 484 485 /* 486 - p_str - string (no metacharacters) "parser" 487 == static void p_str(struct parse *p); 488 */ 489 static void 490 p_str(p) 491 struct parse *p; 492 { 493 (void)REQUIRE(MORE(), REG_EMPTY); 494 while (MORE()) 495 ordinary(p, WGETNEXT()); 496 } 497 498 /* 499 - p_bre - BRE parser top level, anchoring and concatenation 500 == static void p_bre(struct parse *p, int end1, \ 501 == int end2); 502 * Giving end1 as OUT essentially eliminates the end1/end2 check. 503 * 504 * This implementation is a bit of a kludge, in that a trailing $ is first 505 * taken as an ordinary character and then revised to be an anchor. 506 * The amount of lookahead needed to avoid this kludge is excessive. 507 */ 508 static void 509 p_bre(p, end1, end2) 510 struct parse *p; 511 int end1; /* first terminating character */ 512 int end2; /* second terminating character */ 513 { 514 sopno start = HERE(); 515 int first = 1; /* first subexpression? */ 516 int wasdollar = 0; 517 518 if (EAT('^')) { 519 EMIT(OBOL, 0); 520 p->g->iflags |= USEBOL; 521 p->g->nbol++; 522 } 523 while (MORE() && !SEETWO(end1, end2)) { 524 wasdollar = p_simp_re(p, first); 525 first = 0; 526 } 527 if (wasdollar) { /* oops, that was a trailing anchor */ 528 DROP(1); 529 EMIT(OEOL, 0); 530 p->g->iflags |= USEEOL; 531 p->g->neol++; 532 } 533 534 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 535 } 536 537 /* 538 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 539 == static int p_simp_re(struct parse *p, int starordinary); 540 */ 541 static int /* was the simple RE an unbackslashed $? */ 542 p_simp_re(p, starordinary) 543 struct parse *p; 544 int starordinary; /* is a leading * an ordinary character? */ 545 { 546 int c; 547 int count; 548 int count2; 549 sopno pos; 550 int i; 551 wint_t wc; 552 sopno subno; 553 # define BACKSL (1<<CHAR_BIT) 554 555 pos = HERE(); /* repetion op, if any, covers from here */ 556 557 assert(MORE()); /* caller should have ensured this */ 558 c = GETNEXT(); 559 if (c == '\\') { 560 (void)REQUIRE(MORE(), REG_EESCAPE); 561 c = BACKSL | GETNEXT(); 562 } 563 switch (c) { 564 case '.': 565 if (p->g->cflags®_NEWLINE) 566 nonnewline(p); 567 else 568 EMIT(OANY, 0); 569 break; 570 case '[': 571 p_bracket(p); 572 break; 573 case BACKSL|'{': 574 SETERROR(REG_BADRPT); 575 break; 576 case BACKSL|'(': 577 p->g->nsub++; 578 subno = p->g->nsub; 579 if (subno < NPAREN) 580 p->pbegin[subno] = HERE(); 581 EMIT(OLPAREN, subno); 582 /* the MORE here is an error heuristic */ 583 if (MORE() && !SEETWO('\\', ')')) 584 p_bre(p, '\\', ')'); 585 if (subno < NPAREN) { 586 p->pend[subno] = HERE(); 587 assert(p->pend[subno] != 0); 588 } 589 EMIT(ORPAREN, subno); 590 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 591 break; 592 case BACKSL|')': /* should not get here -- must be user */ 593 case BACKSL|'}': 594 SETERROR(REG_EPAREN); 595 break; 596 case BACKSL|'1': 597 case BACKSL|'2': 598 case BACKSL|'3': 599 case BACKSL|'4': 600 case BACKSL|'5': 601 case BACKSL|'6': 602 case BACKSL|'7': 603 case BACKSL|'8': 604 case BACKSL|'9': 605 i = (c&~BACKSL) - '0'; 606 assert(i < NPAREN); 607 if (p->pend[i] != 0) { 608 assert(i <= p->g->nsub); 609 EMIT(OBACK_, i); 610 assert(p->pbegin[i] != 0); 611 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 612 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 613 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 614 EMIT(O_BACK, i); 615 } else 616 SETERROR(REG_ESUBREG); 617 p->g->backrefs = 1; 618 break; 619 case '*': 620 (void)REQUIRE(starordinary, REG_BADRPT); 621 /* FALLTHROUGH */ 622 default: 623 p->next--; 624 wc = WGETNEXT(); 625 ordinary(p, wc); 626 break; 627 } 628 629 if (EAT('*')) { /* implemented as +? */ 630 /* this case does not require the (y|) trick, noKLUDGE */ 631 INSERT(OPLUS_, pos); 632 ASTERN(O_PLUS, pos); 633 INSERT(OQUEST_, pos); 634 ASTERN(O_QUEST, pos); 635 } else if (EATTWO('\\', '{')) { 636 count = p_count(p); 637 if (EAT(',')) { 638 if (MORE() && isdigit((uch)PEEK())) { 639 count2 = p_count(p); 640 (void)REQUIRE(count <= count2, REG_BADBR); 641 } else /* single number with comma */ 642 count2 = INFINITY; 643 } else /* just a single number */ 644 count2 = count; 645 repeat(p, pos, count, count2); 646 if (!EATTWO('\\', '}')) { /* error heuristics */ 647 while (MORE() && !SEETWO('\\', '}')) 648 NEXT(); 649 (void)REQUIRE(MORE(), REG_EBRACE); 650 SETERROR(REG_BADBR); 651 } 652 } else if (c == '$') /* $ (but not \$) ends it */ 653 return(1); 654 655 return(0); 656 } 657 658 /* 659 - p_count - parse a repetition count 660 == static int p_count(struct parse *p); 661 */ 662 static int /* the value */ 663 p_count(p) 664 struct parse *p; 665 { 666 int count = 0; 667 int ndigits = 0; 668 669 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 670 count = count*10 + (GETNEXT() - '0'); 671 ndigits++; 672 } 673 674 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 675 return(count); 676 } 677 678 /* 679 - p_bracket - parse a bracketed character list 680 == static void p_bracket(struct parse *p); 681 */ 682 static void 683 p_bracket(p) 684 struct parse *p; 685 { 686 cset *cs; 687 wint_t ch; 688 689 /* Dept of Truly Sickening Special-Case Kludges */ 690 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 691 EMIT(OBOW, 0); 692 NEXTn(6); 693 return; 694 } 695 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 696 EMIT(OEOW, 0); 697 NEXTn(6); 698 return; 699 } 700 701 if ((cs = allocset(p)) == NULL) 702 return; 703 704 if (p->g->cflags®_ICASE) 705 cs->icase = 1; 706 if (EAT('^')) 707 cs->invert = 1; 708 if (EAT(']')) 709 CHadd(p, cs, ']'); 710 else if (EAT('-')) 711 CHadd(p, cs, '-'); 712 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 713 p_b_term(p, cs); 714 if (EAT('-')) 715 CHadd(p, cs, '-'); 716 (void)MUSTEAT(']', REG_EBRACK); 717 718 if (p->error != 0) /* don't mess things up further */ 719 return; 720 721 if (cs->invert && p->g->cflags®_NEWLINE) 722 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 723 724 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 725 ordinary(p, ch); 726 freeset(p, cs); 727 } else 728 EMIT(OANYOF, (int)(cs - p->g->sets)); 729 } 730 731 /* 732 - p_b_term - parse one term of a bracketed character list 733 == static void p_b_term(struct parse *p, cset *cs); 734 */ 735 static void 736 p_b_term(p, cs) 737 struct parse *p; 738 cset *cs; 739 { 740 char c; 741 wint_t start, finish; 742 wint_t i; 743 744 /* classify what we've got */ 745 switch ((MORE()) ? PEEK() : '\0') { 746 case '[': 747 c = (MORE2()) ? PEEK2() : '\0'; 748 break; 749 case '-': 750 SETERROR(REG_ERANGE); 751 return; /* NOTE RETURN */ 752 break; 753 default: 754 c = '\0'; 755 break; 756 } 757 758 switch (c) { 759 case ':': /* character class */ 760 NEXT2(); 761 (void)REQUIRE(MORE(), REG_EBRACK); 762 c = PEEK(); 763 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 764 p_b_cclass(p, cs); 765 (void)REQUIRE(MORE(), REG_EBRACK); 766 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 767 break; 768 case '=': /* equivalence class */ 769 NEXT2(); 770 (void)REQUIRE(MORE(), REG_EBRACK); 771 c = PEEK(); 772 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 773 p_b_eclass(p, cs); 774 (void)REQUIRE(MORE(), REG_EBRACK); 775 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 776 break; 777 default: /* symbol, ordinary character, or range */ 778 start = p_b_symbol(p); 779 if (SEE('-') && MORE2() && PEEK2() != ']') { 780 /* range */ 781 NEXT(); 782 if (EAT('-')) 783 finish = '-'; 784 else 785 finish = p_b_symbol(p); 786 } else 787 finish = start; 788 if (start == finish) 789 CHadd(p, cs, start); 790 else { 791 if (__collate_load_error) { 792 (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); 793 CHaddrange(p, cs, start, finish); 794 } else { 795 (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); 796 for (i = 0; i <= UCHAR_MAX; i++) { 797 if ( __collate_range_cmp(start, i) <= 0 798 && __collate_range_cmp(i, finish) <= 0 799 ) 800 CHadd(p, cs, i); 801 } 802 } 803 } 804 break; 805 } 806 } 807 808 /* 809 - p_b_cclass - parse a character-class name and deal with it 810 == static void p_b_cclass(struct parse *p, cset *cs); 811 */ 812 static void 813 p_b_cclass(p, cs) 814 struct parse *p; 815 cset *cs; 816 { 817 char *sp = p->next; 818 size_t len; 819 wctype_t wct; 820 char clname[16]; 821 822 while (MORE() && isalpha((uch)PEEK())) 823 NEXT(); 824 len = p->next - sp; 825 if (len >= sizeof(clname) - 1) { 826 SETERROR(REG_ECTYPE); 827 return; 828 } 829 memcpy(clname, sp, len); 830 clname[len] = '\0'; 831 if ((wct = wctype(clname)) == 0) { 832 SETERROR(REG_ECTYPE); 833 return; 834 } 835 CHaddtype(p, cs, wct); 836 } 837 838 /* 839 - p_b_eclass - parse an equivalence-class name and deal with it 840 == static void p_b_eclass(struct parse *p, cset *cs); 841 * 842 * This implementation is incomplete. xxx 843 */ 844 static void 845 p_b_eclass(p, cs) 846 struct parse *p; 847 cset *cs; 848 { 849 wint_t c; 850 851 c = p_b_coll_elem(p, '='); 852 CHadd(p, cs, c); 853 } 854 855 /* 856 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 857 == static char p_b_symbol(struct parse *p); 858 */ 859 static wint_t /* value of symbol */ 860 p_b_symbol(p) 861 struct parse *p; 862 { 863 wint_t value; 864 865 (void)REQUIRE(MORE(), REG_EBRACK); 866 if (!EATTWO('[', '.')) 867 return(WGETNEXT()); 868 869 /* collating symbol */ 870 value = p_b_coll_elem(p, '.'); 871 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 872 return(value); 873 } 874 875 /* 876 - p_b_coll_elem - parse a collating-element name and look it up 877 == static char p_b_coll_elem(struct parse *p, int endc); 878 */ 879 static wint_t /* value of collating element */ 880 p_b_coll_elem(p, endc) 881 struct parse *p; 882 wint_t endc; /* name ended by endc,']' */ 883 { 884 char *sp = p->next; 885 struct cname *cp; 886 int len; 887 mbstate_t mbs; 888 wchar_t wc; 889 size_t clen; 890 891 while (MORE() && !SEETWO(endc, ']')) 892 NEXT(); 893 if (!MORE()) { 894 SETERROR(REG_EBRACK); 895 return(0); 896 } 897 len = p->next - sp; 898 for (cp = cnames; cp->name != NULL; cp++) 899 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 900 return(cp->code); /* known name */ 901 memset(&mbs, 0, sizeof(mbs)); 902 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 903 return (wc); /* single character */ 904 else if (clen == (size_t)-1 || clen == (size_t)-2) 905 SETERROR(REG_ILLSEQ); 906 else 907 SETERROR(REG_ECOLLATE); /* neither */ 908 return(0); 909 } 910 911 /* 912 - othercase - return the case counterpart of an alphabetic 913 == static char othercase(int ch); 914 */ 915 static wint_t /* if no counterpart, return ch */ 916 othercase(ch) 917 wint_t ch; 918 { 919 assert(iswalpha(ch)); 920 if (iswupper(ch)) 921 return(towlower(ch)); 922 else if (iswlower(ch)) 923 return(towupper(ch)); 924 else /* peculiar, but could happen */ 925 return(ch); 926 } 927 928 /* 929 - bothcases - emit a dualcase version of a two-case character 930 == static void bothcases(struct parse *p, int ch); 931 * 932 * Boy, is this implementation ever a kludge... 933 */ 934 static void 935 bothcases(p, ch) 936 struct parse *p; 937 wint_t ch; 938 { 939 char *oldnext = p->next; 940 char *oldend = p->end; 941 char bracket[3 + MB_LEN_MAX]; 942 size_t n; 943 mbstate_t mbs; 944 945 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 946 p->next = bracket; 947 memset(&mbs, 0, sizeof(mbs)); 948 n = wcrtomb(bracket, ch, &mbs); 949 assert(n != (size_t)-1); 950 bracket[n] = ']'; 951 bracket[n + 1] = '\0'; 952 p->end = bracket+n+1; 953 p_bracket(p); 954 assert(p->next == p->end); 955 p->next = oldnext; 956 p->end = oldend; 957 } 958 959 /* 960 - ordinary - emit an ordinary character 961 == static void ordinary(struct parse *p, int ch); 962 */ 963 static void 964 ordinary(p, ch) 965 struct parse *p; 966 wint_t ch; 967 { 968 cset *cs; 969 970 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 971 bothcases(p, ch); 972 else if ((ch & OPDMASK) == ch) 973 EMIT(OCHAR, ch); 974 else { 975 /* 976 * Kludge: character is too big to fit into an OCHAR operand. 977 * Emit a singleton set. 978 */ 979 if ((cs = allocset(p)) == NULL) 980 return; 981 CHadd(p, cs, ch); 982 EMIT(OANYOF, (int)(cs - p->g->sets)); 983 } 984 } 985 986 /* 987 - nonnewline - emit REG_NEWLINE version of OANY 988 == static void nonnewline(struct parse *p); 989 * 990 * Boy, is this implementation ever a kludge... 991 */ 992 static void 993 nonnewline(p) 994 struct parse *p; 995 { 996 char *oldnext = p->next; 997 char *oldend = p->end; 998 char bracket[4]; 999 1000 p->next = bracket; 1001 p->end = bracket+3; 1002 bracket[0] = '^'; 1003 bracket[1] = '\n'; 1004 bracket[2] = ']'; 1005 bracket[3] = '\0'; 1006 p_bracket(p); 1007 assert(p->next == bracket+3); 1008 p->next = oldnext; 1009 p->end = oldend; 1010 } 1011 1012 /* 1013 - repeat - generate code for a bounded repetition, recursively if needed 1014 == static void repeat(struct parse *p, sopno start, int from, int to); 1015 */ 1016 static void 1017 repeat(p, start, from, to) 1018 struct parse *p; 1019 sopno start; /* operand from here to end of strip */ 1020 int from; /* repeated from this number */ 1021 int to; /* to this number of times (maybe INFINITY) */ 1022 { 1023 sopno finish = HERE(); 1024 # define N 2 1025 # define INF 3 1026 # define REP(f, t) ((f)*8 + (t)) 1027 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1028 sopno copy; 1029 1030 if (p->error != 0) /* head off possible runaway recursion */ 1031 return; 1032 1033 assert(from <= to); 1034 1035 switch (REP(MAP(from), MAP(to))) { 1036 case REP(0, 0): /* must be user doing this */ 1037 DROP(finish-start); /* drop the operand */ 1038 break; 1039 case REP(0, 1): /* as x{1,1}? */ 1040 case REP(0, N): /* as x{1,n}? */ 1041 case REP(0, INF): /* as x{1,}? */ 1042 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1043 INSERT(OCH_, start); /* offset is wrong... */ 1044 repeat(p, start+1, 1, to); 1045 ASTERN(OOR1, start); 1046 AHEAD(start); /* ... fix it */ 1047 EMIT(OOR2, 0); 1048 AHEAD(THERE()); 1049 ASTERN(O_CH, THERETHERE()); 1050 break; 1051 case REP(1, 1): /* trivial case */ 1052 /* done */ 1053 break; 1054 case REP(1, N): /* as x?x{1,n-1} */ 1055 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1056 INSERT(OCH_, start); 1057 ASTERN(OOR1, start); 1058 AHEAD(start); 1059 EMIT(OOR2, 0); /* offset very wrong... */ 1060 AHEAD(THERE()); /* ...so fix it */ 1061 ASTERN(O_CH, THERETHERE()); 1062 copy = dupl(p, start+1, finish+1); 1063 assert(copy == finish+4); 1064 repeat(p, copy, 1, to-1); 1065 break; 1066 case REP(1, INF): /* as x+ */ 1067 INSERT(OPLUS_, start); 1068 ASTERN(O_PLUS, start); 1069 break; 1070 case REP(N, N): /* as xx{m-1,n-1} */ 1071 copy = dupl(p, start, finish); 1072 repeat(p, copy, from-1, to-1); 1073 break; 1074 case REP(N, INF): /* as xx{n-1,INF} */ 1075 copy = dupl(p, start, finish); 1076 repeat(p, copy, from-1, to); 1077 break; 1078 default: /* "can't happen" */ 1079 SETERROR(REG_ASSERT); /* just in case */ 1080 break; 1081 } 1082 } 1083 1084 /* 1085 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1086 - character from the parse struct, signals a REG_ILLSEQ error if the 1087 - character can't be converted. Returns the number of bytes consumed. 1088 */ 1089 static wint_t 1090 wgetnext(p) 1091 struct parse *p; 1092 { 1093 mbstate_t mbs; 1094 wchar_t wc; 1095 size_t n; 1096 1097 memset(&mbs, 0, sizeof(mbs)); 1098 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); 1099 if (n == (size_t)-1 || n == (size_t)-2) { 1100 SETERROR(REG_ILLSEQ); 1101 return (0); 1102 } 1103 if (n == 0) 1104 n = 1; 1105 p->next += n; 1106 return (wc); 1107 } 1108 1109 /* 1110 - seterr - set an error condition 1111 == static int seterr(struct parse *p, int e); 1112 */ 1113 static int /* useless but makes type checking happy */ 1114 seterr(p, e) 1115 struct parse *p; 1116 int e; 1117 { 1118 if (p->error == 0) /* keep earliest error condition */ 1119 p->error = e; 1120 p->next = nuls; /* try to bring things to a halt */ 1121 p->end = nuls; 1122 return(0); /* make the return value well-defined */ 1123 } 1124 1125 /* 1126 - allocset - allocate a set of characters for [] 1127 == static cset *allocset(struct parse *p); 1128 */ 1129 static cset * 1130 allocset(p) 1131 struct parse *p; 1132 { 1133 cset *cs, *ncs; 1134 1135 ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs)); 1136 if (ncs == NULL) { 1137 SETERROR(REG_ESPACE); 1138 return (NULL); 1139 } 1140 p->g->sets = ncs; 1141 cs = &p->g->sets[p->g->ncsets++]; 1142 memset(cs, 0, sizeof(*cs)); 1143 1144 return(cs); 1145 } 1146 1147 /* 1148 - freeset - free a now-unused set 1149 == static void freeset(struct parse *p, cset *cs); 1150 */ 1151 static void 1152 freeset(p, cs) 1153 struct parse *p; 1154 cset *cs; 1155 { 1156 cset *top = &p->g->sets[p->g->ncsets]; 1157 1158 free(cs->wides); 1159 free(cs->ranges); 1160 free(cs->types); 1161 memset(cs, 0, sizeof(*cs)); 1162 if (cs == top-1) /* recover only the easy case */ 1163 p->g->ncsets--; 1164 } 1165 1166 /* 1167 - singleton - Determine whether a set contains only one character, 1168 - returning it if so, otherwise returning OUT. 1169 */ 1170 static wint_t 1171 singleton(cs) 1172 cset *cs; 1173 { 1174 wint_t i, s, n; 1175 1176 for (i = n = 0; i < NC; i++) 1177 if (CHIN(cs, i)) { 1178 n++; 1179 s = i; 1180 } 1181 if (n == 1) 1182 return (s); 1183 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1184 cs->icase == 0) 1185 return (cs->wides[0]); 1186 /* Don't bother handling the other cases. */ 1187 return (OUT); 1188 } 1189 1190 /* 1191 - CHadd - add character to character set. 1192 */ 1193 static void 1194 CHadd(p, cs, ch) 1195 struct parse *p; 1196 cset *cs; 1197 wint_t ch; 1198 { 1199 wint_t nch, *newwides; 1200 assert(ch >= 0); 1201 if (ch < NC) 1202 cs->bmp[ch >> 3] |= 1 << (ch & 7); 1203 else { 1204 newwides = realloc(cs->wides, (cs->nwides + 1) * 1205 sizeof(*cs->wides)); 1206 if (newwides == NULL) { 1207 SETERROR(REG_ESPACE); 1208 return; 1209 } 1210 cs->wides = newwides; 1211 cs->wides[cs->nwides++] = ch; 1212 } 1213 if (cs->icase) { 1214 if ((nch = towlower(ch)) < NC) 1215 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1216 if ((nch = towupper(ch)) < NC) 1217 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1218 } 1219 } 1220 1221 /* 1222 - CHaddrange - add all characters in the range [min,max] to a character set. 1223 */ 1224 static void 1225 CHaddrange(p, cs, min, max) 1226 struct parse *p; 1227 cset *cs; 1228 wint_t min, max; 1229 { 1230 crange *newranges; 1231 1232 for (; min < NC && min <= max; min++) 1233 CHadd(p, cs, min); 1234 if (min >= max) 1235 return; 1236 newranges = realloc(cs->ranges, (cs->nranges + 1) * 1237 sizeof(*cs->ranges)); 1238 if (newranges == NULL) { 1239 SETERROR(REG_ESPACE); 1240 return; 1241 } 1242 cs->ranges = newranges; 1243 cs->ranges[cs->nranges].min = min; 1244 cs->ranges[cs->nranges].min = max; 1245 cs->nranges++; 1246 } 1247 1248 /* 1249 - CHaddtype - add all characters of a certain type to a character set. 1250 */ 1251 static void 1252 CHaddtype(p, cs, wct) 1253 struct parse *p; 1254 cset *cs; 1255 wctype_t wct; 1256 { 1257 wint_t i; 1258 wctype_t *newtypes; 1259 1260 for (i = 0; i < NC; i++) 1261 if (iswctype(i, wct)) 1262 CHadd(p, cs, i); 1263 newtypes = realloc(cs->types, (cs->ntypes + 1) * 1264 sizeof(*cs->types)); 1265 if (newtypes == NULL) { 1266 SETERROR(REG_ESPACE); 1267 return; 1268 } 1269 cs->types = newtypes; 1270 cs->types[cs->ntypes++] = wct; 1271 } 1272 1273 /* 1274 - dupl - emit a duplicate of a bunch of sops 1275 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1276 */ 1277 static sopno /* start of duplicate */ 1278 dupl(p, start, finish) 1279 struct parse *p; 1280 sopno start; /* from here */ 1281 sopno finish; /* to this less one */ 1282 { 1283 sopno ret = HERE(); 1284 sopno len = finish - start; 1285 1286 assert(finish >= start); 1287 if (len == 0) 1288 return(ret); 1289 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1290 assert(p->ssize >= p->slen + len); 1291 (void) memcpy((char *)(p->strip + p->slen), 1292 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1293 p->slen += len; 1294 return(ret); 1295 } 1296 1297 /* 1298 - doemit - emit a strip operator 1299 == static void doemit(struct parse *p, sop op, size_t opnd); 1300 * 1301 * It might seem better to implement this as a macro with a function as 1302 * hard-case backup, but it's just too big and messy unless there are 1303 * some changes to the data structures. Maybe later. 1304 */ 1305 static void 1306 doemit(p, op, opnd) 1307 struct parse *p; 1308 sop op; 1309 size_t opnd; 1310 { 1311 /* avoid making error situations worse */ 1312 if (p->error != 0) 1313 return; 1314 1315 /* deal with oversize operands ("can't happen", more or less) */ 1316 assert(opnd < 1<<OPSHIFT); 1317 1318 /* deal with undersized strip */ 1319 if (p->slen >= p->ssize) 1320 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1321 assert(p->slen < p->ssize); 1322 1323 /* finally, it's all reduced to the easy case */ 1324 p->strip[p->slen++] = SOP(op, opnd); 1325 } 1326 1327 /* 1328 - doinsert - insert a sop into the strip 1329 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1330 */ 1331 static void 1332 doinsert(p, op, opnd, pos) 1333 struct parse *p; 1334 sop op; 1335 size_t opnd; 1336 sopno pos; 1337 { 1338 sopno sn; 1339 sop s; 1340 int i; 1341 1342 /* avoid making error situations worse */ 1343 if (p->error != 0) 1344 return; 1345 1346 sn = HERE(); 1347 EMIT(op, opnd); /* do checks, ensure space */ 1348 assert(HERE() == sn+1); 1349 s = p->strip[sn]; 1350 1351 /* adjust paren pointers */ 1352 assert(pos > 0); 1353 for (i = 1; i < NPAREN; i++) { 1354 if (p->pbegin[i] >= pos) { 1355 p->pbegin[i]++; 1356 } 1357 if (p->pend[i] >= pos) { 1358 p->pend[i]++; 1359 } 1360 } 1361 1362 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1363 (HERE()-pos-1)*sizeof(sop)); 1364 p->strip[pos] = s; 1365 } 1366 1367 /* 1368 - dofwd - complete a forward reference 1369 == static void dofwd(struct parse *p, sopno pos, sop value); 1370 */ 1371 static void 1372 dofwd(p, pos, value) 1373 struct parse *p; 1374 sopno pos; 1375 sop value; 1376 { 1377 /* avoid making error situations worse */ 1378 if (p->error != 0) 1379 return; 1380 1381 assert(value < 1<<OPSHIFT); 1382 p->strip[pos] = OP(p->strip[pos]) | value; 1383 } 1384 1385 /* 1386 - enlarge - enlarge the strip 1387 == static void enlarge(struct parse *p, sopno size); 1388 */ 1389 static void 1390 enlarge(p, size) 1391 struct parse *p; 1392 sopno size; 1393 { 1394 sop *sp; 1395 1396 if (p->ssize >= size) 1397 return; 1398 1399 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1400 if (sp == NULL) { 1401 SETERROR(REG_ESPACE); 1402 return; 1403 } 1404 p->strip = sp; 1405 p->ssize = size; 1406 } 1407 1408 /* 1409 - stripsnug - compact the strip 1410 == static void stripsnug(struct parse *p, struct re_guts *g); 1411 */ 1412 static void 1413 stripsnug(p, g) 1414 struct parse *p; 1415 struct re_guts *g; 1416 { 1417 g->nstates = p->slen; 1418 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1419 if (g->strip == NULL) { 1420 SETERROR(REG_ESPACE); 1421 g->strip = p->strip; 1422 } 1423 } 1424 1425 /* 1426 - findmust - fill in must and mlen with longest mandatory literal string 1427 == static void findmust(struct parse *p, struct re_guts *g); 1428 * 1429 * This algorithm could do fancy things like analyzing the operands of | 1430 * for common subsequences. Someday. This code is simple and finds most 1431 * of the interesting cases. 1432 * 1433 * Note that must and mlen got initialized during setup. 1434 */ 1435 static void 1436 findmust(p, g) 1437 struct parse *p; 1438 struct re_guts *g; 1439 { 1440 sop *scan; 1441 sop *start; 1442 sop *newstart; 1443 sopno newlen; 1444 sop s; 1445 char *cp; 1446 int offset; 1447 char buf[MB_LEN_MAX]; 1448 size_t clen; 1449 mbstate_t mbs; 1450 1451 /* avoid making error situations worse */ 1452 if (p->error != 0) 1453 return; 1454 1455 /* 1456 * It's not generally safe to do a ``char'' substring search on 1457 * multibyte character strings, but it's safe for at least 1458 * UTF-8 (see RFC 3629). 1459 */ 1460 if (MB_CUR_MAX > 1 && 1461 strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) 1462 return; 1463 1464 /* find the longest OCHAR sequence in strip */ 1465 newlen = 0; 1466 offset = 0; 1467 g->moffset = 0; 1468 scan = g->strip + 1; 1469 do { 1470 s = *scan++; 1471 switch (OP(s)) { 1472 case OCHAR: /* sequence member */ 1473 if (newlen == 0) { /* new sequence */ 1474 memset(&mbs, 0, sizeof(mbs)); 1475 newstart = scan - 1; 1476 } 1477 clen = wcrtomb(buf, OPND(s), &mbs); 1478 if (clen == (size_t)-1) 1479 goto toohard; 1480 newlen += clen; 1481 break; 1482 case OPLUS_: /* things that don't break one */ 1483 case OLPAREN: 1484 case ORPAREN: 1485 break; 1486 case OQUEST_: /* things that must be skipped */ 1487 case OCH_: 1488 offset = altoffset(scan, offset); 1489 scan--; 1490 do { 1491 scan += OPND(s); 1492 s = *scan; 1493 /* assert() interferes w debug printouts */ 1494 if (OP(s) != O_QUEST && OP(s) != O_CH && 1495 OP(s) != OOR2) { 1496 g->iflags |= BAD; 1497 return; 1498 } 1499 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1500 /* FALLTHROUGH */ 1501 case OBOW: /* things that break a sequence */ 1502 case OEOW: 1503 case OBOL: 1504 case OEOL: 1505 case O_QUEST: 1506 case O_CH: 1507 case OEND: 1508 if (newlen > g->mlen) { /* ends one */ 1509 start = newstart; 1510 g->mlen = newlen; 1511 if (offset > -1) { 1512 g->moffset += offset; 1513 offset = newlen; 1514 } else 1515 g->moffset = offset; 1516 } else { 1517 if (offset > -1) 1518 offset += newlen; 1519 } 1520 newlen = 0; 1521 break; 1522 case OANY: 1523 if (newlen > g->mlen) { /* ends one */ 1524 start = newstart; 1525 g->mlen = newlen; 1526 if (offset > -1) { 1527 g->moffset += offset; 1528 offset = newlen; 1529 } else 1530 g->moffset = offset; 1531 } else { 1532 if (offset > -1) 1533 offset += newlen; 1534 } 1535 if (offset > -1) 1536 offset++; 1537 newlen = 0; 1538 break; 1539 case OANYOF: /* may or may not invalidate offset */ 1540 /* First, everything as OANY */ 1541 if (newlen > g->mlen) { /* ends one */ 1542 start = newstart; 1543 g->mlen = newlen; 1544 if (offset > -1) { 1545 g->moffset += offset; 1546 offset = newlen; 1547 } else 1548 g->moffset = offset; 1549 } else { 1550 if (offset > -1) 1551 offset += newlen; 1552 } 1553 if (offset > -1) 1554 offset++; 1555 newlen = 0; 1556 break; 1557 toohard: 1558 default: 1559 /* Anything here makes it impossible or too hard 1560 * to calculate the offset -- so we give up; 1561 * save the last known good offset, in case the 1562 * must sequence doesn't occur later. 1563 */ 1564 if (newlen > g->mlen) { /* ends one */ 1565 start = newstart; 1566 g->mlen = newlen; 1567 if (offset > -1) 1568 g->moffset += offset; 1569 else 1570 g->moffset = offset; 1571 } 1572 offset = -1; 1573 newlen = 0; 1574 break; 1575 } 1576 } while (OP(s) != OEND); 1577 1578 if (g->mlen == 0) { /* there isn't one */ 1579 g->moffset = -1; 1580 return; 1581 } 1582 1583 /* turn it into a character string */ 1584 g->must = malloc((size_t)g->mlen + 1); 1585 if (g->must == NULL) { /* argh; just forget it */ 1586 g->mlen = 0; 1587 g->moffset = -1; 1588 return; 1589 } 1590 cp = g->must; 1591 scan = start; 1592 memset(&mbs, 0, sizeof(mbs)); 1593 while (cp < g->must + g->mlen) { 1594 while (OP(s = *scan++) != OCHAR) 1595 continue; 1596 clen = wcrtomb(cp, OPND(s), &mbs); 1597 assert(clen != (size_t)-1); 1598 cp += clen; 1599 } 1600 assert(cp == g->must + g->mlen); 1601 *cp++ = '\0'; /* just on general principles */ 1602 } 1603 1604 /* 1605 - altoffset - choose biggest offset among multiple choices 1606 == static int altoffset(sop *scan, int offset); 1607 * 1608 * Compute, recursively if necessary, the largest offset among multiple 1609 * re paths. 1610 */ 1611 static int 1612 altoffset(scan, offset) 1613 sop *scan; 1614 int offset; 1615 { 1616 int largest; 1617 int try; 1618 sop s; 1619 1620 /* If we gave up already on offsets, return */ 1621 if (offset == -1) 1622 return -1; 1623 1624 largest = 0; 1625 try = 0; 1626 s = *scan++; 1627 while (OP(s) != O_QUEST && OP(s) != O_CH) { 1628 switch (OP(s)) { 1629 case OOR1: 1630 if (try > largest) 1631 largest = try; 1632 try = 0; 1633 break; 1634 case OQUEST_: 1635 case OCH_: 1636 try = altoffset(scan, try); 1637 if (try == -1) 1638 return -1; 1639 scan--; 1640 do { 1641 scan += OPND(s); 1642 s = *scan; 1643 if (OP(s) != O_QUEST && OP(s) != O_CH && 1644 OP(s) != OOR2) 1645 return -1; 1646 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1647 /* We must skip to the next position, or we'll 1648 * leave altoffset() too early. 1649 */ 1650 scan++; 1651 break; 1652 case OANYOF: 1653 case OCHAR: 1654 case OANY: 1655 try++; 1656 case OBOW: 1657 case OEOW: 1658 case OLPAREN: 1659 case ORPAREN: 1660 case OOR2: 1661 break; 1662 default: 1663 try = -1; 1664 break; 1665 } 1666 if (try == -1) 1667 return -1; 1668 s = *scan++; 1669 } 1670 1671 if (try > largest) 1672 largest = try; 1673 1674 return largest+offset; 1675 } 1676 1677 /* 1678 - computejumps - compute char jumps for BM scan 1679 == static void computejumps(struct parse *p, struct re_guts *g); 1680 * 1681 * This algorithm assumes g->must exists and is has size greater than 1682 * zero. It's based on the algorithm found on Computer Algorithms by 1683 * Sara Baase. 1684 * 1685 * A char jump is the number of characters one needs to jump based on 1686 * the value of the character from the text that was mismatched. 1687 */ 1688 static void 1689 computejumps(p, g) 1690 struct parse *p; 1691 struct re_guts *g; 1692 { 1693 int ch; 1694 int mindex; 1695 1696 /* Avoid making errors worse */ 1697 if (p->error != 0) 1698 return; 1699 1700 g->charjump = (int*) malloc((NC + 1) * sizeof(int)); 1701 if (g->charjump == NULL) /* Not a fatal error */ 1702 return; 1703 /* Adjust for signed chars, if necessary */ 1704 g->charjump = &g->charjump[-(CHAR_MIN)]; 1705 1706 /* If the character does not exist in the pattern, the jump 1707 * is equal to the number of characters in the pattern. 1708 */ 1709 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 1710 g->charjump[ch] = g->mlen; 1711 1712 /* If the character does exist, compute the jump that would 1713 * take us to the last character in the pattern equal to it 1714 * (notice that we match right to left, so that last character 1715 * is the first one that would be matched). 1716 */ 1717 for (mindex = 0; mindex < g->mlen; mindex++) 1718 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 1719 } 1720 1721 /* 1722 - computematchjumps - compute match jumps for BM scan 1723 == static void computematchjumps(struct parse *p, struct re_guts *g); 1724 * 1725 * This algorithm assumes g->must exists and is has size greater than 1726 * zero. It's based on the algorithm found on Computer Algorithms by 1727 * Sara Baase. 1728 * 1729 * A match jump is the number of characters one needs to advance based 1730 * on the already-matched suffix. 1731 * Notice that all values here are minus (g->mlen-1), because of the way 1732 * the search algorithm works. 1733 */ 1734 static void 1735 computematchjumps(p, g) 1736 struct parse *p; 1737 struct re_guts *g; 1738 { 1739 int mindex; /* General "must" iterator */ 1740 int suffix; /* Keeps track of matching suffix */ 1741 int ssuffix; /* Keeps track of suffixes' suffix */ 1742 int* pmatches; /* pmatches[k] points to the next i 1743 * such that i+1...mlen is a substring 1744 * of k+1...k+mlen-i-1 1745 */ 1746 1747 /* Avoid making errors worse */ 1748 if (p->error != 0) 1749 return; 1750 1751 pmatches = (int*) malloc(g->mlen * sizeof(unsigned int)); 1752 if (pmatches == NULL) { 1753 g->matchjump = NULL; 1754 return; 1755 } 1756 1757 g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int)); 1758 if (g->matchjump == NULL) /* Not a fatal error */ 1759 return; 1760 1761 /* Set maximum possible jump for each character in the pattern */ 1762 for (mindex = 0; mindex < g->mlen; mindex++) 1763 g->matchjump[mindex] = 2*g->mlen - mindex - 1; 1764 1765 /* Compute pmatches[] */ 1766 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 1767 mindex--, suffix--) { 1768 pmatches[mindex] = suffix; 1769 1770 /* If a mismatch is found, interrupting the substring, 1771 * compute the matchjump for that position. If no 1772 * mismatch is found, then a text substring mismatched 1773 * against the suffix will also mismatch against the 1774 * substring. 1775 */ 1776 while (suffix < g->mlen 1777 && g->must[mindex] != g->must[suffix]) { 1778 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1779 g->mlen - mindex - 1); 1780 suffix = pmatches[suffix]; 1781 } 1782 } 1783 1784 /* Compute the matchjump up to the last substring found to jump 1785 * to the beginning of the largest must pattern prefix matching 1786 * it's own suffix. 1787 */ 1788 for (mindex = 0; mindex <= suffix; mindex++) 1789 g->matchjump[mindex] = MIN(g->matchjump[mindex], 1790 g->mlen + suffix - mindex); 1791 1792 ssuffix = pmatches[suffix]; 1793 while (suffix < g->mlen) { 1794 while (suffix <= ssuffix && suffix < g->mlen) { 1795 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1796 g->mlen + ssuffix - suffix); 1797 suffix++; 1798 } 1799 if (suffix < g->mlen) 1800 ssuffix = pmatches[ssuffix]; 1801 } 1802 1803 free(pmatches); 1804 } 1805 1806 /* 1807 - pluscount - count + nesting 1808 == static sopno pluscount(struct parse *p, struct re_guts *g); 1809 */ 1810 static sopno /* nesting depth */ 1811 pluscount(p, g) 1812 struct parse *p; 1813 struct re_guts *g; 1814 { 1815 sop *scan; 1816 sop s; 1817 sopno plusnest = 0; 1818 sopno maxnest = 0; 1819 1820 if (p->error != 0) 1821 return(0); /* there may not be an OEND */ 1822 1823 scan = g->strip + 1; 1824 do { 1825 s = *scan++; 1826 switch (OP(s)) { 1827 case OPLUS_: 1828 plusnest++; 1829 break; 1830 case O_PLUS: 1831 if (plusnest > maxnest) 1832 maxnest = plusnest; 1833 plusnest--; 1834 break; 1835 } 1836 } while (OP(s) != OEND); 1837 if (plusnest != 0) 1838 g->iflags |= BAD; 1839 return(maxnest); 1840 } 1841