1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 34 */ 35 36 #if defined(LIBC_SCCS) && !defined(lint) 37 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 38 #endif /* LIBC_SCCS and not lint */ 39 #include <sys/cdefs.h> 40 __FBSDID("$FreeBSD$"); 41 42 #include <sys/types.h> 43 #include <stdio.h> 44 #include <string.h> 45 #include <ctype.h> 46 #include <limits.h> 47 #include <stdlib.h> 48 #include <regex.h> 49 #include <runetype.h> 50 #include <wchar.h> 51 #include <wctype.h> 52 53 #include "collate.h" 54 55 #include "utils.h" 56 #include "regex2.h" 57 58 #include "cname.h" 59 60 /* 61 * parse structure, passed up and down to avoid global variables and 62 * other clumsinesses 63 */ 64 struct parse { 65 char *next; /* next character in RE */ 66 char *end; /* end of string (-> NUL normally) */ 67 int error; /* has an error been seen? */ 68 sop *strip; /* malloced strip */ 69 sopno ssize; /* malloced strip size (allocated) */ 70 sopno slen; /* malloced strip length (used) */ 71 int ncsalloc; /* number of csets allocated */ 72 struct re_guts *g; 73 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 74 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 75 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 76 }; 77 78 /* ========= begin header generated by ./mkh ========= */ 79 #ifdef __cplusplus 80 extern "C" { 81 #endif 82 83 /* === regcomp.c === */ 84 static void p_ere(struct parse *p, wint_t stop); 85 static void p_ere_exp(struct parse *p); 86 static void p_str(struct parse *p); 87 static void p_bre(struct parse *p, wint_t end1, wint_t end2); 88 static int p_simp_re(struct parse *p, int starordinary); 89 static int p_count(struct parse *p); 90 static void p_bracket(struct parse *p); 91 static void p_b_term(struct parse *p, cset *cs); 92 static void p_b_cclass(struct parse *p, cset *cs); 93 static void p_b_eclass(struct parse *p, cset *cs); 94 static wint_t p_b_symbol(struct parse *p); 95 static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 96 static wint_t othercase(wint_t ch); 97 static void bothcases(struct parse *p, wint_t ch); 98 static void ordinary(struct parse *p, wint_t ch); 99 static void nonnewline(struct parse *p); 100 static void repeat(struct parse *p, sopno start, int from, int to); 101 static int seterr(struct parse *p, int e); 102 static cset *allocset(struct parse *p); 103 static void freeset(struct parse *p, cset *cs); 104 static void CHadd(struct parse *p, cset *cs, wint_t ch); 105 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 106 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 107 static wint_t singleton(cset *cs); 108 static sopno dupl(struct parse *p, sopno start, sopno finish); 109 static void doemit(struct parse *p, sop op, size_t opnd); 110 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 111 static void dofwd(struct parse *p, sopno pos, sop value); 112 static void enlarge(struct parse *p, sopno size); 113 static void stripsnug(struct parse *p, struct re_guts *g); 114 static void findmust(struct parse *p, struct re_guts *g); 115 static int altoffset(sop *scan, int offset); 116 static void computejumps(struct parse *p, struct re_guts *g); 117 static void computematchjumps(struct parse *p, struct re_guts *g); 118 static sopno pluscount(struct parse *p, struct re_guts *g); 119 static wint_t wgetnext(struct parse *p); 120 121 #ifdef __cplusplus 122 } 123 #endif 124 /* ========= end header generated by ./mkh ========= */ 125 126 static char nuls[10]; /* place to point scanner in event of error */ 127 128 /* 129 * macros for use with parse structure 130 * BEWARE: these know that the parse structure is named `p' !!! 131 */ 132 #define PEEK() (*p->next) 133 #define PEEK2() (*(p->next+1)) 134 #define MORE() (p->next < p->end) 135 #define MORE2() (p->next+1 < p->end) 136 #define SEE(c) (MORE() && PEEK() == (c)) 137 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 138 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 139 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 140 #define NEXT() (p->next++) 141 #define NEXT2() (p->next += 2) 142 #define NEXTn(n) (p->next += (n)) 143 #define GETNEXT() (*p->next++) 144 #define WGETNEXT() wgetnext(p) 145 #define SETERROR(e) seterr(p, (e)) 146 #define REQUIRE(co, e) ((co) || SETERROR(e)) 147 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 148 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 149 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 150 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 151 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 152 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 153 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 154 #define HERE() (p->slen) 155 #define THERE() (p->slen - 1) 156 #define THERETHERE() (p->slen - 2) 157 #define DROP(n) (p->slen -= (n)) 158 159 #ifndef NDEBUG 160 static int never = 0; /* for use in asserts; shuts lint up */ 161 #else 162 #define never 0 /* some <assert.h>s have bugs too */ 163 #endif 164 165 /* Macro used by computejump()/computematchjump() */ 166 #define MIN(a,b) ((a)<(b)?(a):(b)) 167 168 /* 169 - regcomp - interface for parser and compilation 170 = extern int regcomp(regex_t *, const char *, int); 171 = #define REG_BASIC 0000 172 = #define REG_EXTENDED 0001 173 = #define REG_ICASE 0002 174 = #define REG_NOSUB 0004 175 = #define REG_NEWLINE 0010 176 = #define REG_NOSPEC 0020 177 = #define REG_PEND 0040 178 = #define REG_DUMP 0200 179 */ 180 int /* 0 success, otherwise REG_something */ 181 regcomp(regex_t * __restrict preg, 182 const char * __restrict pattern, 183 int cflags) 184 { 185 struct parse pa; 186 struct re_guts *g; 187 struct parse *p = &pa; 188 int i; 189 size_t len; 190 #ifdef REDEBUG 191 # define GOODFLAGS(f) (f) 192 #else 193 # define GOODFLAGS(f) ((f)&~REG_DUMP) 194 #endif 195 196 cflags = GOODFLAGS(cflags); 197 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 198 return(REG_INVARG); 199 200 if (cflags®_PEND) { 201 if (preg->re_endp < pattern) 202 return(REG_INVARG); 203 len = preg->re_endp - pattern; 204 } else 205 len = strlen((char *)pattern); 206 207 /* do the mallocs early so failure handling is easy */ 208 g = (struct re_guts *)malloc(sizeof(struct re_guts)); 209 if (g == NULL) 210 return(REG_ESPACE); 211 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 212 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 213 p->slen = 0; 214 if (p->strip == NULL) { 215 free((char *)g); 216 return(REG_ESPACE); 217 } 218 219 /* set things up */ 220 p->g = g; 221 p->next = (char *)pattern; /* convenience; we do not modify it */ 222 p->end = p->next + len; 223 p->error = 0; 224 p->ncsalloc = 0; 225 for (i = 0; i < NPAREN; i++) { 226 p->pbegin[i] = 0; 227 p->pend[i] = 0; 228 } 229 g->sets = NULL; 230 g->ncsets = 0; 231 g->cflags = cflags; 232 g->iflags = 0; 233 g->nbol = 0; 234 g->neol = 0; 235 g->must = NULL; 236 g->moffset = -1; 237 g->charjump = NULL; 238 g->matchjump = NULL; 239 g->mlen = 0; 240 g->nsub = 0; 241 g->backrefs = 0; 242 243 /* do it */ 244 EMIT(OEND, 0); 245 g->firststate = THERE(); 246 if (cflags®_EXTENDED) 247 p_ere(p, OUT); 248 else if (cflags®_NOSPEC) 249 p_str(p); 250 else 251 p_bre(p, OUT, OUT); 252 EMIT(OEND, 0); 253 g->laststate = THERE(); 254 255 /* tidy up loose ends and fill things in */ 256 stripsnug(p, g); 257 findmust(p, g); 258 /* only use Boyer-Moore algorithm if the pattern is bigger 259 * than three characters 260 */ 261 if(g->mlen > 3) { 262 computejumps(p, g); 263 computematchjumps(p, g); 264 if(g->matchjump == NULL && g->charjump != NULL) { 265 free(g->charjump); 266 g->charjump = NULL; 267 } 268 } 269 g->nplus = pluscount(p, g); 270 g->magic = MAGIC2; 271 preg->re_nsub = g->nsub; 272 preg->re_g = g; 273 preg->re_magic = MAGIC1; 274 #ifndef REDEBUG 275 /* not debugging, so can't rely on the assert() in regexec() */ 276 if (g->iflags&BAD) 277 SETERROR(REG_ASSERT); 278 #endif 279 280 /* win or lose, we're done */ 281 if (p->error != 0) /* lose */ 282 regfree(preg); 283 return(p->error); 284 } 285 286 /* 287 - p_ere - ERE parser top level, concatenation and alternation 288 == static void p_ere(struct parse *p, int stop); 289 */ 290 static void 291 p_ere(struct parse *p, 292 int stop) /* character this ERE should end at */ 293 { 294 char c; 295 sopno prevback; 296 sopno prevfwd; 297 sopno conc; 298 int first = 1; /* is this the first alternative? */ 299 300 for (;;) { 301 /* do a bunch of concatenated expressions */ 302 conc = HERE(); 303 while (MORE() && (c = PEEK()) != '|' && c != stop) 304 p_ere_exp(p); 305 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 306 307 if (!EAT('|')) 308 break; /* NOTE BREAK OUT */ 309 310 if (first) { 311 INSERT(OCH_, conc); /* offset is wrong */ 312 prevfwd = conc; 313 prevback = conc; 314 first = 0; 315 } 316 ASTERN(OOR1, prevback); 317 prevback = THERE(); 318 AHEAD(prevfwd); /* fix previous offset */ 319 prevfwd = HERE(); 320 EMIT(OOR2, 0); /* offset is very wrong */ 321 } 322 323 if (!first) { /* tail-end fixups */ 324 AHEAD(prevfwd); 325 ASTERN(O_CH, prevback); 326 } 327 328 assert(!MORE() || SEE(stop)); 329 } 330 331 /* 332 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 333 == static void p_ere_exp(struct parse *p); 334 */ 335 static void 336 p_ere_exp(struct parse *p) 337 { 338 char c; 339 wint_t wc; 340 sopno pos; 341 int count; 342 int count2; 343 sopno subno; 344 int wascaret = 0; 345 346 assert(MORE()); /* caller should have ensured this */ 347 c = GETNEXT(); 348 349 pos = HERE(); 350 switch (c) { 351 case '(': 352 (void)REQUIRE(MORE(), REG_EPAREN); 353 p->g->nsub++; 354 subno = p->g->nsub; 355 if (subno < NPAREN) 356 p->pbegin[subno] = HERE(); 357 EMIT(OLPAREN, subno); 358 if (!SEE(')')) 359 p_ere(p, ')'); 360 if (subno < NPAREN) { 361 p->pend[subno] = HERE(); 362 assert(p->pend[subno] != 0); 363 } 364 EMIT(ORPAREN, subno); 365 (void)MUSTEAT(')', REG_EPAREN); 366 break; 367 #ifndef POSIX_MISTAKE 368 case ')': /* happens only if no current unmatched ( */ 369 /* 370 * You may ask, why the ifndef? Because I didn't notice 371 * this until slightly too late for 1003.2, and none of the 372 * other 1003.2 regular-expression reviewers noticed it at 373 * all. So an unmatched ) is legal POSIX, at least until 374 * we can get it fixed. 375 */ 376 SETERROR(REG_EPAREN); 377 break; 378 #endif 379 case '^': 380 EMIT(OBOL, 0); 381 p->g->iflags |= USEBOL; 382 p->g->nbol++; 383 wascaret = 1; 384 break; 385 case '$': 386 EMIT(OEOL, 0); 387 p->g->iflags |= USEEOL; 388 p->g->neol++; 389 break; 390 case '|': 391 SETERROR(REG_EMPTY); 392 break; 393 case '*': 394 case '+': 395 case '?': 396 SETERROR(REG_BADRPT); 397 break; 398 case '.': 399 if (p->g->cflags®_NEWLINE) 400 nonnewline(p); 401 else 402 EMIT(OANY, 0); 403 break; 404 case '[': 405 p_bracket(p); 406 break; 407 case '\\': 408 (void)REQUIRE(MORE(), REG_EESCAPE); 409 wc = WGETNEXT(); 410 ordinary(p, wc); 411 break; 412 case '{': /* okay as ordinary except if digit follows */ 413 (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 414 /* FALLTHROUGH */ 415 default: 416 p->next--; 417 wc = WGETNEXT(); 418 ordinary(p, wc); 419 break; 420 } 421 422 if (!MORE()) 423 return; 424 c = PEEK(); 425 /* we call { a repetition if followed by a digit */ 426 if (!( c == '*' || c == '+' || c == '?' || 427 (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) 428 return; /* no repetition, we're done */ 429 NEXT(); 430 431 (void)REQUIRE(!wascaret, REG_BADRPT); 432 switch (c) { 433 case '*': /* implemented as +? */ 434 /* this case does not require the (y|) trick, noKLUDGE */ 435 INSERT(OPLUS_, pos); 436 ASTERN(O_PLUS, pos); 437 INSERT(OQUEST_, pos); 438 ASTERN(O_QUEST, pos); 439 break; 440 case '+': 441 INSERT(OPLUS_, pos); 442 ASTERN(O_PLUS, pos); 443 break; 444 case '?': 445 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 446 INSERT(OCH_, pos); /* offset slightly wrong */ 447 ASTERN(OOR1, pos); /* this one's right */ 448 AHEAD(pos); /* fix the OCH_ */ 449 EMIT(OOR2, 0); /* offset very wrong... */ 450 AHEAD(THERE()); /* ...so fix it */ 451 ASTERN(O_CH, THERETHERE()); 452 break; 453 case '{': 454 count = p_count(p); 455 if (EAT(',')) { 456 if (isdigit((uch)PEEK())) { 457 count2 = p_count(p); 458 (void)REQUIRE(count <= count2, REG_BADBR); 459 } else /* single number with comma */ 460 count2 = INFINITY; 461 } else /* just a single number */ 462 count2 = count; 463 repeat(p, pos, count, count2); 464 if (!EAT('}')) { /* error heuristics */ 465 while (MORE() && PEEK() != '}') 466 NEXT(); 467 (void)REQUIRE(MORE(), REG_EBRACE); 468 SETERROR(REG_BADBR); 469 } 470 break; 471 } 472 473 if (!MORE()) 474 return; 475 c = PEEK(); 476 if (!( c == '*' || c == '+' || c == '?' || 477 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 478 return; 479 SETERROR(REG_BADRPT); 480 } 481 482 /* 483 - p_str - string (no metacharacters) "parser" 484 == static void p_str(struct parse *p); 485 */ 486 static void 487 p_str(struct parse *p) 488 { 489 (void)REQUIRE(MORE(), REG_EMPTY); 490 while (MORE()) 491 ordinary(p, WGETNEXT()); 492 } 493 494 /* 495 - p_bre - BRE parser top level, anchoring and concatenation 496 == static void p_bre(struct parse *p, int end1, \ 497 == int end2); 498 * Giving end1 as OUT essentially eliminates the end1/end2 check. 499 * 500 * This implementation is a bit of a kludge, in that a trailing $ is first 501 * taken as an ordinary character and then revised to be an anchor. 502 * The amount of lookahead needed to avoid this kludge is excessive. 503 */ 504 static void 505 p_bre(struct parse *p, 506 int end1, /* first terminating character */ 507 int end2) /* second terminating character */ 508 { 509 sopno start = HERE(); 510 int first = 1; /* first subexpression? */ 511 int wasdollar = 0; 512 513 if (EAT('^')) { 514 EMIT(OBOL, 0); 515 p->g->iflags |= USEBOL; 516 p->g->nbol++; 517 } 518 while (MORE() && !SEETWO(end1, end2)) { 519 wasdollar = p_simp_re(p, first); 520 first = 0; 521 } 522 if (wasdollar) { /* oops, that was a trailing anchor */ 523 DROP(1); 524 EMIT(OEOL, 0); 525 p->g->iflags |= USEEOL; 526 p->g->neol++; 527 } 528 529 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 530 } 531 532 /* 533 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 534 == static int p_simp_re(struct parse *p, int starordinary); 535 */ 536 static int /* was the simple RE an unbackslashed $? */ 537 p_simp_re(struct parse *p, 538 int starordinary) /* is a leading * an ordinary character? */ 539 { 540 int c; 541 int count; 542 int count2; 543 sopno pos; 544 int i; 545 wint_t wc; 546 sopno subno; 547 # define BACKSL (1<<CHAR_BIT) 548 549 pos = HERE(); /* repetion op, if any, covers from here */ 550 551 assert(MORE()); /* caller should have ensured this */ 552 c = GETNEXT(); 553 if (c == '\\') { 554 (void)REQUIRE(MORE(), REG_EESCAPE); 555 c = BACKSL | GETNEXT(); 556 } 557 switch (c) { 558 case '.': 559 if (p->g->cflags®_NEWLINE) 560 nonnewline(p); 561 else 562 EMIT(OANY, 0); 563 break; 564 case '[': 565 p_bracket(p); 566 break; 567 case BACKSL|'{': 568 SETERROR(REG_BADRPT); 569 break; 570 case BACKSL|'(': 571 p->g->nsub++; 572 subno = p->g->nsub; 573 if (subno < NPAREN) 574 p->pbegin[subno] = HERE(); 575 EMIT(OLPAREN, subno); 576 /* the MORE here is an error heuristic */ 577 if (MORE() && !SEETWO('\\', ')')) 578 p_bre(p, '\\', ')'); 579 if (subno < NPAREN) { 580 p->pend[subno] = HERE(); 581 assert(p->pend[subno] != 0); 582 } 583 EMIT(ORPAREN, subno); 584 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 585 break; 586 case BACKSL|')': /* should not get here -- must be user */ 587 case BACKSL|'}': 588 SETERROR(REG_EPAREN); 589 break; 590 case BACKSL|'1': 591 case BACKSL|'2': 592 case BACKSL|'3': 593 case BACKSL|'4': 594 case BACKSL|'5': 595 case BACKSL|'6': 596 case BACKSL|'7': 597 case BACKSL|'8': 598 case BACKSL|'9': 599 i = (c&~BACKSL) - '0'; 600 assert(i < NPAREN); 601 if (p->pend[i] != 0) { 602 assert(i <= p->g->nsub); 603 EMIT(OBACK_, i); 604 assert(p->pbegin[i] != 0); 605 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 606 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 607 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 608 EMIT(O_BACK, i); 609 } else 610 SETERROR(REG_ESUBREG); 611 p->g->backrefs = 1; 612 break; 613 case '*': 614 (void)REQUIRE(starordinary, REG_BADRPT); 615 /* FALLTHROUGH */ 616 default: 617 p->next--; 618 wc = WGETNEXT(); 619 ordinary(p, wc); 620 break; 621 } 622 623 if (EAT('*')) { /* implemented as +? */ 624 /* this case does not require the (y|) trick, noKLUDGE */ 625 INSERT(OPLUS_, pos); 626 ASTERN(O_PLUS, pos); 627 INSERT(OQUEST_, pos); 628 ASTERN(O_QUEST, pos); 629 } else if (EATTWO('\\', '{')) { 630 count = p_count(p); 631 if (EAT(',')) { 632 if (MORE() && isdigit((uch)PEEK())) { 633 count2 = p_count(p); 634 (void)REQUIRE(count <= count2, REG_BADBR); 635 } else /* single number with comma */ 636 count2 = INFINITY; 637 } else /* just a single number */ 638 count2 = count; 639 repeat(p, pos, count, count2); 640 if (!EATTWO('\\', '}')) { /* error heuristics */ 641 while (MORE() && !SEETWO('\\', '}')) 642 NEXT(); 643 (void)REQUIRE(MORE(), REG_EBRACE); 644 SETERROR(REG_BADBR); 645 } 646 } else if (c == '$') /* $ (but not \$) ends it */ 647 return(1); 648 649 return(0); 650 } 651 652 /* 653 - p_count - parse a repetition count 654 == static int p_count(struct parse *p); 655 */ 656 static int /* the value */ 657 p_count(struct parse *p) 658 { 659 int count = 0; 660 int ndigits = 0; 661 662 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 663 count = count*10 + (GETNEXT() - '0'); 664 ndigits++; 665 } 666 667 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 668 return(count); 669 } 670 671 /* 672 - p_bracket - parse a bracketed character list 673 == static void p_bracket(struct parse *p); 674 */ 675 static void 676 p_bracket(struct parse *p) 677 { 678 cset *cs; 679 wint_t ch; 680 681 /* Dept of Truly Sickening Special-Case Kludges */ 682 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 683 EMIT(OBOW, 0); 684 NEXTn(6); 685 return; 686 } 687 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 688 EMIT(OEOW, 0); 689 NEXTn(6); 690 return; 691 } 692 693 if ((cs = allocset(p)) == NULL) 694 return; 695 696 if (p->g->cflags®_ICASE) 697 cs->icase = 1; 698 if (EAT('^')) 699 cs->invert = 1; 700 if (EAT(']')) 701 CHadd(p, cs, ']'); 702 else if (EAT('-')) 703 CHadd(p, cs, '-'); 704 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 705 p_b_term(p, cs); 706 if (EAT('-')) 707 CHadd(p, cs, '-'); 708 (void)MUSTEAT(']', REG_EBRACK); 709 710 if (p->error != 0) /* don't mess things up further */ 711 return; 712 713 if (cs->invert && p->g->cflags®_NEWLINE) 714 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 715 716 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 717 ordinary(p, ch); 718 freeset(p, cs); 719 } else 720 EMIT(OANYOF, (int)(cs - p->g->sets)); 721 } 722 723 /* 724 - p_b_term - parse one term of a bracketed character list 725 == static void p_b_term(struct parse *p, cset *cs); 726 */ 727 static void 728 p_b_term(struct parse *p, cset *cs) 729 { 730 char c; 731 wint_t start, finish; 732 wint_t i; 733 734 /* classify what we've got */ 735 switch ((MORE()) ? PEEK() : '\0') { 736 case '[': 737 c = (MORE2()) ? PEEK2() : '\0'; 738 break; 739 case '-': 740 SETERROR(REG_ERANGE); 741 return; /* NOTE RETURN */ 742 break; 743 default: 744 c = '\0'; 745 break; 746 } 747 748 switch (c) { 749 case ':': /* character class */ 750 NEXT2(); 751 (void)REQUIRE(MORE(), REG_EBRACK); 752 c = PEEK(); 753 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 754 p_b_cclass(p, cs); 755 (void)REQUIRE(MORE(), REG_EBRACK); 756 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 757 break; 758 case '=': /* equivalence class */ 759 NEXT2(); 760 (void)REQUIRE(MORE(), REG_EBRACK); 761 c = PEEK(); 762 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 763 p_b_eclass(p, cs); 764 (void)REQUIRE(MORE(), REG_EBRACK); 765 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 766 break; 767 default: /* symbol, ordinary character, or range */ 768 start = p_b_symbol(p); 769 if (SEE('-') && MORE2() && PEEK2() != ']') { 770 /* range */ 771 NEXT(); 772 if (EAT('-')) 773 finish = '-'; 774 else 775 finish = p_b_symbol(p); 776 } else 777 finish = start; 778 if (start == finish) 779 CHadd(p, cs, start); 780 else { 781 if (__collate_load_error) { 782 (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); 783 CHaddrange(p, cs, start, finish); 784 } else { 785 (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); 786 for (i = 0; i <= UCHAR_MAX; i++) { 787 if ( __collate_range_cmp(start, i) <= 0 788 && __collate_range_cmp(i, finish) <= 0 789 ) 790 CHadd(p, cs, i); 791 } 792 } 793 } 794 break; 795 } 796 } 797 798 /* 799 - p_b_cclass - parse a character-class name and deal with it 800 == static void p_b_cclass(struct parse *p, cset *cs); 801 */ 802 static void 803 p_b_cclass(struct parse *p, cset *cs) 804 { 805 char *sp = p->next; 806 size_t len; 807 wctype_t wct; 808 char clname[16]; 809 810 while (MORE() && isalpha((uch)PEEK())) 811 NEXT(); 812 len = p->next - sp; 813 if (len >= sizeof(clname) - 1) { 814 SETERROR(REG_ECTYPE); 815 return; 816 } 817 memcpy(clname, sp, len); 818 clname[len] = '\0'; 819 if ((wct = wctype(clname)) == 0) { 820 SETERROR(REG_ECTYPE); 821 return; 822 } 823 CHaddtype(p, cs, wct); 824 } 825 826 /* 827 - p_b_eclass - parse an equivalence-class name and deal with it 828 == static void p_b_eclass(struct parse *p, cset *cs); 829 * 830 * This implementation is incomplete. xxx 831 */ 832 static void 833 p_b_eclass(struct parse *p, cset *cs) 834 { 835 wint_t c; 836 837 c = p_b_coll_elem(p, '='); 838 CHadd(p, cs, c); 839 } 840 841 /* 842 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 843 == static char p_b_symbol(struct parse *p); 844 */ 845 static wint_t /* value of symbol */ 846 p_b_symbol(struct parse *p) 847 { 848 wint_t value; 849 850 (void)REQUIRE(MORE(), REG_EBRACK); 851 if (!EATTWO('[', '.')) 852 return(WGETNEXT()); 853 854 /* collating symbol */ 855 value = p_b_coll_elem(p, '.'); 856 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 857 return(value); 858 } 859 860 /* 861 - p_b_coll_elem - parse a collating-element name and look it up 862 == static char p_b_coll_elem(struct parse *p, int endc); 863 */ 864 static wint_t /* value of collating element */ 865 p_b_coll_elem(struct parse *p, 866 wint_t endc) /* name ended by endc,']' */ 867 { 868 char *sp = p->next; 869 struct cname *cp; 870 int len; 871 mbstate_t mbs; 872 wchar_t wc; 873 size_t clen; 874 875 while (MORE() && !SEETWO(endc, ']')) 876 NEXT(); 877 if (!MORE()) { 878 SETERROR(REG_EBRACK); 879 return(0); 880 } 881 len = p->next - sp; 882 for (cp = cnames; cp->name != NULL; cp++) 883 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 884 return(cp->code); /* known name */ 885 memset(&mbs, 0, sizeof(mbs)); 886 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 887 return (wc); /* single character */ 888 else if (clen == (size_t)-1 || clen == (size_t)-2) 889 SETERROR(REG_ILLSEQ); 890 else 891 SETERROR(REG_ECOLLATE); /* neither */ 892 return(0); 893 } 894 895 /* 896 - othercase - return the case counterpart of an alphabetic 897 == static char othercase(int ch); 898 */ 899 static wint_t /* if no counterpart, return ch */ 900 othercase(wint_t ch) 901 { 902 assert(iswalpha(ch)); 903 if (iswupper(ch)) 904 return(towlower(ch)); 905 else if (iswlower(ch)) 906 return(towupper(ch)); 907 else /* peculiar, but could happen */ 908 return(ch); 909 } 910 911 /* 912 - bothcases - emit a dualcase version of a two-case character 913 == static void bothcases(struct parse *p, int ch); 914 * 915 * Boy, is this implementation ever a kludge... 916 */ 917 static void 918 bothcases(struct parse *p, wint_t ch) 919 { 920 char *oldnext = p->next; 921 char *oldend = p->end; 922 char bracket[3 + MB_LEN_MAX]; 923 size_t n; 924 mbstate_t mbs; 925 926 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 927 p->next = bracket; 928 memset(&mbs, 0, sizeof(mbs)); 929 n = wcrtomb(bracket, ch, &mbs); 930 assert(n != (size_t)-1); 931 bracket[n] = ']'; 932 bracket[n + 1] = '\0'; 933 p->end = bracket+n+1; 934 p_bracket(p); 935 assert(p->next == p->end); 936 p->next = oldnext; 937 p->end = oldend; 938 } 939 940 /* 941 - ordinary - emit an ordinary character 942 == static void ordinary(struct parse *p, int ch); 943 */ 944 static void 945 ordinary(struct parse *p, wint_t ch) 946 { 947 cset *cs; 948 949 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 950 bothcases(p, ch); 951 else if ((ch & OPDMASK) == ch) 952 EMIT(OCHAR, ch); 953 else { 954 /* 955 * Kludge: character is too big to fit into an OCHAR operand. 956 * Emit a singleton set. 957 */ 958 if ((cs = allocset(p)) == NULL) 959 return; 960 CHadd(p, cs, ch); 961 EMIT(OANYOF, (int)(cs - p->g->sets)); 962 } 963 } 964 965 /* 966 - nonnewline - emit REG_NEWLINE version of OANY 967 == static void nonnewline(struct parse *p); 968 * 969 * Boy, is this implementation ever a kludge... 970 */ 971 static void 972 nonnewline(struct parse *p) 973 { 974 char *oldnext = p->next; 975 char *oldend = p->end; 976 char bracket[4]; 977 978 p->next = bracket; 979 p->end = bracket+3; 980 bracket[0] = '^'; 981 bracket[1] = '\n'; 982 bracket[2] = ']'; 983 bracket[3] = '\0'; 984 p_bracket(p); 985 assert(p->next == bracket+3); 986 p->next = oldnext; 987 p->end = oldend; 988 } 989 990 /* 991 - repeat - generate code for a bounded repetition, recursively if needed 992 == static void repeat(struct parse *p, sopno start, int from, int to); 993 */ 994 static void 995 repeat(struct parse *p, 996 sopno start, /* operand from here to end of strip */ 997 int from, /* repeated from this number */ 998 int to) /* to this number of times (maybe INFINITY) */ 999 { 1000 sopno finish = HERE(); 1001 # define N 2 1002 # define INF 3 1003 # define REP(f, t) ((f)*8 + (t)) 1004 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1005 sopno copy; 1006 1007 if (p->error != 0) /* head off possible runaway recursion */ 1008 return; 1009 1010 assert(from <= to); 1011 1012 switch (REP(MAP(from), MAP(to))) { 1013 case REP(0, 0): /* must be user doing this */ 1014 DROP(finish-start); /* drop the operand */ 1015 break; 1016 case REP(0, 1): /* as x{1,1}? */ 1017 case REP(0, N): /* as x{1,n}? */ 1018 case REP(0, INF): /* as x{1,}? */ 1019 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1020 INSERT(OCH_, start); /* offset is wrong... */ 1021 repeat(p, start+1, 1, to); 1022 ASTERN(OOR1, start); 1023 AHEAD(start); /* ... fix it */ 1024 EMIT(OOR2, 0); 1025 AHEAD(THERE()); 1026 ASTERN(O_CH, THERETHERE()); 1027 break; 1028 case REP(1, 1): /* trivial case */ 1029 /* done */ 1030 break; 1031 case REP(1, N): /* as x?x{1,n-1} */ 1032 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1033 INSERT(OCH_, start); 1034 ASTERN(OOR1, start); 1035 AHEAD(start); 1036 EMIT(OOR2, 0); /* offset very wrong... */ 1037 AHEAD(THERE()); /* ...so fix it */ 1038 ASTERN(O_CH, THERETHERE()); 1039 copy = dupl(p, start+1, finish+1); 1040 assert(copy == finish+4); 1041 repeat(p, copy, 1, to-1); 1042 break; 1043 case REP(1, INF): /* as x+ */ 1044 INSERT(OPLUS_, start); 1045 ASTERN(O_PLUS, start); 1046 break; 1047 case REP(N, N): /* as xx{m-1,n-1} */ 1048 copy = dupl(p, start, finish); 1049 repeat(p, copy, from-1, to-1); 1050 break; 1051 case REP(N, INF): /* as xx{n-1,INF} */ 1052 copy = dupl(p, start, finish); 1053 repeat(p, copy, from-1, to); 1054 break; 1055 default: /* "can't happen" */ 1056 SETERROR(REG_ASSERT); /* just in case */ 1057 break; 1058 } 1059 } 1060 1061 /* 1062 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1063 - character from the parse struct, signals a REG_ILLSEQ error if the 1064 - character can't be converted. Returns the number of bytes consumed. 1065 */ 1066 static wint_t 1067 wgetnext(struct parse *p) 1068 { 1069 mbstate_t mbs; 1070 wchar_t wc; 1071 size_t n; 1072 1073 memset(&mbs, 0, sizeof(mbs)); 1074 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); 1075 if (n == (size_t)-1 || n == (size_t)-2) { 1076 SETERROR(REG_ILLSEQ); 1077 return (0); 1078 } 1079 if (n == 0) 1080 n = 1; 1081 p->next += n; 1082 return (wc); 1083 } 1084 1085 /* 1086 - seterr - set an error condition 1087 == static int seterr(struct parse *p, int e); 1088 */ 1089 static int /* useless but makes type checking happy */ 1090 seterr(struct parse *p, int e) 1091 { 1092 if (p->error == 0) /* keep earliest error condition */ 1093 p->error = e; 1094 p->next = nuls; /* try to bring things to a halt */ 1095 p->end = nuls; 1096 return(0); /* make the return value well-defined */ 1097 } 1098 1099 /* 1100 - allocset - allocate a set of characters for [] 1101 == static cset *allocset(struct parse *p); 1102 */ 1103 static cset * 1104 allocset(struct parse *p) 1105 { 1106 cset *cs, *ncs; 1107 1108 ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs)); 1109 if (ncs == NULL) { 1110 SETERROR(REG_ESPACE); 1111 return (NULL); 1112 } 1113 p->g->sets = ncs; 1114 cs = &p->g->sets[p->g->ncsets++]; 1115 memset(cs, 0, sizeof(*cs)); 1116 1117 return(cs); 1118 } 1119 1120 /* 1121 - freeset - free a now-unused set 1122 == static void freeset(struct parse *p, cset *cs); 1123 */ 1124 static void 1125 freeset(struct parse *p, cset *cs) 1126 { 1127 cset *top = &p->g->sets[p->g->ncsets]; 1128 1129 free(cs->wides); 1130 free(cs->ranges); 1131 free(cs->types); 1132 memset(cs, 0, sizeof(*cs)); 1133 if (cs == top-1) /* recover only the easy case */ 1134 p->g->ncsets--; 1135 } 1136 1137 /* 1138 - singleton - Determine whether a set contains only one character, 1139 - returning it if so, otherwise returning OUT. 1140 */ 1141 static wint_t 1142 singleton(cset *cs) 1143 { 1144 wint_t i, s, n; 1145 1146 for (i = n = 0; i < NC; i++) 1147 if (CHIN(cs, i)) { 1148 n++; 1149 s = i; 1150 } 1151 if (n == 1) 1152 return (s); 1153 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1154 cs->icase == 0) 1155 return (cs->wides[0]); 1156 /* Don't bother handling the other cases. */ 1157 return (OUT); 1158 } 1159 1160 /* 1161 - CHadd - add character to character set. 1162 */ 1163 static void 1164 CHadd(struct parse *p, cset *cs, wint_t ch) 1165 { 1166 wint_t nch, *newwides; 1167 assert(ch >= 0); 1168 if (ch < NC) 1169 cs->bmp[ch >> 3] |= 1 << (ch & 7); 1170 else { 1171 newwides = realloc(cs->wides, (cs->nwides + 1) * 1172 sizeof(*cs->wides)); 1173 if (newwides == NULL) { 1174 SETERROR(REG_ESPACE); 1175 return; 1176 } 1177 cs->wides = newwides; 1178 cs->wides[cs->nwides++] = ch; 1179 } 1180 if (cs->icase) { 1181 if ((nch = towlower(ch)) < NC) 1182 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1183 if ((nch = towupper(ch)) < NC) 1184 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1185 } 1186 } 1187 1188 /* 1189 - CHaddrange - add all characters in the range [min,max] to a character set. 1190 */ 1191 static void 1192 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 1193 { 1194 crange *newranges; 1195 1196 for (; min < NC && min <= max; min++) 1197 CHadd(p, cs, min); 1198 if (min >= max) 1199 return; 1200 newranges = realloc(cs->ranges, (cs->nranges + 1) * 1201 sizeof(*cs->ranges)); 1202 if (newranges == NULL) { 1203 SETERROR(REG_ESPACE); 1204 return; 1205 } 1206 cs->ranges = newranges; 1207 cs->ranges[cs->nranges].min = min; 1208 cs->ranges[cs->nranges].min = max; 1209 cs->nranges++; 1210 } 1211 1212 /* 1213 - CHaddtype - add all characters of a certain type to a character set. 1214 */ 1215 static void 1216 CHaddtype(struct parse *p, cset *cs, wctype_t wct) 1217 { 1218 wint_t i; 1219 wctype_t *newtypes; 1220 1221 for (i = 0; i < NC; i++) 1222 if (iswctype(i, wct)) 1223 CHadd(p, cs, i); 1224 newtypes = realloc(cs->types, (cs->ntypes + 1) * 1225 sizeof(*cs->types)); 1226 if (newtypes == NULL) { 1227 SETERROR(REG_ESPACE); 1228 return; 1229 } 1230 cs->types = newtypes; 1231 cs->types[cs->ntypes++] = wct; 1232 } 1233 1234 /* 1235 - dupl - emit a duplicate of a bunch of sops 1236 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1237 */ 1238 static sopno /* start of duplicate */ 1239 dupl(struct parse *p, 1240 sopno start, /* from here */ 1241 sopno finish) /* to this less one */ 1242 { 1243 sopno ret = HERE(); 1244 sopno len = finish - start; 1245 1246 assert(finish >= start); 1247 if (len == 0) 1248 return(ret); 1249 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1250 assert(p->ssize >= p->slen + len); 1251 (void) memcpy((char *)(p->strip + p->slen), 1252 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1253 p->slen += len; 1254 return(ret); 1255 } 1256 1257 /* 1258 - doemit - emit a strip operator 1259 == static void doemit(struct parse *p, sop op, size_t opnd); 1260 * 1261 * It might seem better to implement this as a macro with a function as 1262 * hard-case backup, but it's just too big and messy unless there are 1263 * some changes to the data structures. Maybe later. 1264 */ 1265 static void 1266 doemit(struct parse *p, sop op, size_t opnd) 1267 { 1268 /* avoid making error situations worse */ 1269 if (p->error != 0) 1270 return; 1271 1272 /* deal with oversize operands ("can't happen", more or less) */ 1273 assert(opnd < 1<<OPSHIFT); 1274 1275 /* deal with undersized strip */ 1276 if (p->slen >= p->ssize) 1277 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1278 assert(p->slen < p->ssize); 1279 1280 /* finally, it's all reduced to the easy case */ 1281 p->strip[p->slen++] = SOP(op, opnd); 1282 } 1283 1284 /* 1285 - doinsert - insert a sop into the strip 1286 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1287 */ 1288 static void 1289 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1290 { 1291 sopno sn; 1292 sop s; 1293 int i; 1294 1295 /* avoid making error situations worse */ 1296 if (p->error != 0) 1297 return; 1298 1299 sn = HERE(); 1300 EMIT(op, opnd); /* do checks, ensure space */ 1301 assert(HERE() == sn+1); 1302 s = p->strip[sn]; 1303 1304 /* adjust paren pointers */ 1305 assert(pos > 0); 1306 for (i = 1; i < NPAREN; i++) { 1307 if (p->pbegin[i] >= pos) { 1308 p->pbegin[i]++; 1309 } 1310 if (p->pend[i] >= pos) { 1311 p->pend[i]++; 1312 } 1313 } 1314 1315 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1316 (HERE()-pos-1)*sizeof(sop)); 1317 p->strip[pos] = s; 1318 } 1319 1320 /* 1321 - dofwd - complete a forward reference 1322 == static void dofwd(struct parse *p, sopno pos, sop value); 1323 */ 1324 static void 1325 dofwd(struct parse *p, sopno pos, sop value) 1326 { 1327 /* avoid making error situations worse */ 1328 if (p->error != 0) 1329 return; 1330 1331 assert(value < 1<<OPSHIFT); 1332 p->strip[pos] = OP(p->strip[pos]) | value; 1333 } 1334 1335 /* 1336 - enlarge - enlarge the strip 1337 == static void enlarge(struct parse *p, sopno size); 1338 */ 1339 static void 1340 enlarge(struct parse *p, sopno size) 1341 { 1342 sop *sp; 1343 1344 if (p->ssize >= size) 1345 return; 1346 1347 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1348 if (sp == NULL) { 1349 SETERROR(REG_ESPACE); 1350 return; 1351 } 1352 p->strip = sp; 1353 p->ssize = size; 1354 } 1355 1356 /* 1357 - stripsnug - compact the strip 1358 == static void stripsnug(struct parse *p, struct re_guts *g); 1359 */ 1360 static void 1361 stripsnug(struct parse *p, struct re_guts *g) 1362 { 1363 g->nstates = p->slen; 1364 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1365 if (g->strip == NULL) { 1366 SETERROR(REG_ESPACE); 1367 g->strip = p->strip; 1368 } 1369 } 1370 1371 /* 1372 - findmust - fill in must and mlen with longest mandatory literal string 1373 == static void findmust(struct parse *p, struct re_guts *g); 1374 * 1375 * This algorithm could do fancy things like analyzing the operands of | 1376 * for common subsequences. Someday. This code is simple and finds most 1377 * of the interesting cases. 1378 * 1379 * Note that must and mlen got initialized during setup. 1380 */ 1381 static void 1382 findmust(struct parse *p, struct re_guts *g) 1383 { 1384 sop *scan; 1385 sop *start; 1386 sop *newstart; 1387 sopno newlen; 1388 sop s; 1389 char *cp; 1390 int offset; 1391 char buf[MB_LEN_MAX]; 1392 size_t clen; 1393 mbstate_t mbs; 1394 1395 /* avoid making error situations worse */ 1396 if (p->error != 0) 1397 return; 1398 1399 /* 1400 * It's not generally safe to do a ``char'' substring search on 1401 * multibyte character strings, but it's safe for at least 1402 * UTF-8 (see RFC 3629). 1403 */ 1404 if (MB_CUR_MAX > 1 && 1405 strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) 1406 return; 1407 1408 /* find the longest OCHAR sequence in strip */ 1409 newlen = 0; 1410 offset = 0; 1411 g->moffset = 0; 1412 scan = g->strip + 1; 1413 do { 1414 s = *scan++; 1415 switch (OP(s)) { 1416 case OCHAR: /* sequence member */ 1417 if (newlen == 0) { /* new sequence */ 1418 memset(&mbs, 0, sizeof(mbs)); 1419 newstart = scan - 1; 1420 } 1421 clen = wcrtomb(buf, OPND(s), &mbs); 1422 if (clen == (size_t)-1) 1423 goto toohard; 1424 newlen += clen; 1425 break; 1426 case OPLUS_: /* things that don't break one */ 1427 case OLPAREN: 1428 case ORPAREN: 1429 break; 1430 case OQUEST_: /* things that must be skipped */ 1431 case OCH_: 1432 offset = altoffset(scan, offset); 1433 scan--; 1434 do { 1435 scan += OPND(s); 1436 s = *scan; 1437 /* assert() interferes w debug printouts */ 1438 if (OP(s) != O_QUEST && OP(s) != O_CH && 1439 OP(s) != OOR2) { 1440 g->iflags |= BAD; 1441 return; 1442 } 1443 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1444 /* FALLTHROUGH */ 1445 case OBOW: /* things that break a sequence */ 1446 case OEOW: 1447 case OBOL: 1448 case OEOL: 1449 case O_QUEST: 1450 case O_CH: 1451 case OEND: 1452 if (newlen > g->mlen) { /* ends one */ 1453 start = newstart; 1454 g->mlen = newlen; 1455 if (offset > -1) { 1456 g->moffset += offset; 1457 offset = newlen; 1458 } else 1459 g->moffset = offset; 1460 } else { 1461 if (offset > -1) 1462 offset += newlen; 1463 } 1464 newlen = 0; 1465 break; 1466 case OANY: 1467 if (newlen > g->mlen) { /* ends one */ 1468 start = newstart; 1469 g->mlen = newlen; 1470 if (offset > -1) { 1471 g->moffset += offset; 1472 offset = newlen; 1473 } else 1474 g->moffset = offset; 1475 } else { 1476 if (offset > -1) 1477 offset += newlen; 1478 } 1479 if (offset > -1) 1480 offset++; 1481 newlen = 0; 1482 break; 1483 case OANYOF: /* may or may not invalidate offset */ 1484 /* First, everything as OANY */ 1485 if (newlen > g->mlen) { /* ends one */ 1486 start = newstart; 1487 g->mlen = newlen; 1488 if (offset > -1) { 1489 g->moffset += offset; 1490 offset = newlen; 1491 } else 1492 g->moffset = offset; 1493 } else { 1494 if (offset > -1) 1495 offset += newlen; 1496 } 1497 if (offset > -1) 1498 offset++; 1499 newlen = 0; 1500 break; 1501 toohard: 1502 default: 1503 /* Anything here makes it impossible or too hard 1504 * to calculate the offset -- so we give up; 1505 * save the last known good offset, in case the 1506 * must sequence doesn't occur later. 1507 */ 1508 if (newlen > g->mlen) { /* ends one */ 1509 start = newstart; 1510 g->mlen = newlen; 1511 if (offset > -1) 1512 g->moffset += offset; 1513 else 1514 g->moffset = offset; 1515 } 1516 offset = -1; 1517 newlen = 0; 1518 break; 1519 } 1520 } while (OP(s) != OEND); 1521 1522 if (g->mlen == 0) { /* there isn't one */ 1523 g->moffset = -1; 1524 return; 1525 } 1526 1527 /* turn it into a character string */ 1528 g->must = malloc((size_t)g->mlen + 1); 1529 if (g->must == NULL) { /* argh; just forget it */ 1530 g->mlen = 0; 1531 g->moffset = -1; 1532 return; 1533 } 1534 cp = g->must; 1535 scan = start; 1536 memset(&mbs, 0, sizeof(mbs)); 1537 while (cp < g->must + g->mlen) { 1538 while (OP(s = *scan++) != OCHAR) 1539 continue; 1540 clen = wcrtomb(cp, OPND(s), &mbs); 1541 assert(clen != (size_t)-1); 1542 cp += clen; 1543 } 1544 assert(cp == g->must + g->mlen); 1545 *cp++ = '\0'; /* just on general principles */ 1546 } 1547 1548 /* 1549 - altoffset - choose biggest offset among multiple choices 1550 == static int altoffset(sop *scan, int offset); 1551 * 1552 * Compute, recursively if necessary, the largest offset among multiple 1553 * re paths. 1554 */ 1555 static int 1556 altoffset(sop *scan, int offset) 1557 { 1558 int largest; 1559 int try; 1560 sop s; 1561 1562 /* If we gave up already on offsets, return */ 1563 if (offset == -1) 1564 return -1; 1565 1566 largest = 0; 1567 try = 0; 1568 s = *scan++; 1569 while (OP(s) != O_QUEST && OP(s) != O_CH) { 1570 switch (OP(s)) { 1571 case OOR1: 1572 if (try > largest) 1573 largest = try; 1574 try = 0; 1575 break; 1576 case OQUEST_: 1577 case OCH_: 1578 try = altoffset(scan, try); 1579 if (try == -1) 1580 return -1; 1581 scan--; 1582 do { 1583 scan += OPND(s); 1584 s = *scan; 1585 if (OP(s) != O_QUEST && OP(s) != O_CH && 1586 OP(s) != OOR2) 1587 return -1; 1588 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1589 /* We must skip to the next position, or we'll 1590 * leave altoffset() too early. 1591 */ 1592 scan++; 1593 break; 1594 case OANYOF: 1595 case OCHAR: 1596 case OANY: 1597 try++; 1598 case OBOW: 1599 case OEOW: 1600 case OLPAREN: 1601 case ORPAREN: 1602 case OOR2: 1603 break; 1604 default: 1605 try = -1; 1606 break; 1607 } 1608 if (try == -1) 1609 return -1; 1610 s = *scan++; 1611 } 1612 1613 if (try > largest) 1614 largest = try; 1615 1616 return largest+offset; 1617 } 1618 1619 /* 1620 - computejumps - compute char jumps for BM scan 1621 == static void computejumps(struct parse *p, struct re_guts *g); 1622 * 1623 * This algorithm assumes g->must exists and is has size greater than 1624 * zero. It's based on the algorithm found on Computer Algorithms by 1625 * Sara Baase. 1626 * 1627 * A char jump is the number of characters one needs to jump based on 1628 * the value of the character from the text that was mismatched. 1629 */ 1630 static void 1631 computejumps(struct parse *p, struct re_guts *g) 1632 { 1633 int ch; 1634 int mindex; 1635 1636 /* Avoid making errors worse */ 1637 if (p->error != 0) 1638 return; 1639 1640 g->charjump = (int*) malloc((NC + 1) * sizeof(int)); 1641 if (g->charjump == NULL) /* Not a fatal error */ 1642 return; 1643 /* Adjust for signed chars, if necessary */ 1644 g->charjump = &g->charjump[-(CHAR_MIN)]; 1645 1646 /* If the character does not exist in the pattern, the jump 1647 * is equal to the number of characters in the pattern. 1648 */ 1649 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 1650 g->charjump[ch] = g->mlen; 1651 1652 /* If the character does exist, compute the jump that would 1653 * take us to the last character in the pattern equal to it 1654 * (notice that we match right to left, so that last character 1655 * is the first one that would be matched). 1656 */ 1657 for (mindex = 0; mindex < g->mlen; mindex++) 1658 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 1659 } 1660 1661 /* 1662 - computematchjumps - compute match jumps for BM scan 1663 == static void computematchjumps(struct parse *p, struct re_guts *g); 1664 * 1665 * This algorithm assumes g->must exists and is has size greater than 1666 * zero. It's based on the algorithm found on Computer Algorithms by 1667 * Sara Baase. 1668 * 1669 * A match jump is the number of characters one needs to advance based 1670 * on the already-matched suffix. 1671 * Notice that all values here are minus (g->mlen-1), because of the way 1672 * the search algorithm works. 1673 */ 1674 static void 1675 computematchjumps(struct parse *p, struct re_guts *g) 1676 { 1677 int mindex; /* General "must" iterator */ 1678 int suffix; /* Keeps track of matching suffix */ 1679 int ssuffix; /* Keeps track of suffixes' suffix */ 1680 int* pmatches; /* pmatches[k] points to the next i 1681 * such that i+1...mlen is a substring 1682 * of k+1...k+mlen-i-1 1683 */ 1684 1685 /* Avoid making errors worse */ 1686 if (p->error != 0) 1687 return; 1688 1689 pmatches = (int*) malloc(g->mlen * sizeof(unsigned int)); 1690 if (pmatches == NULL) { 1691 g->matchjump = NULL; 1692 return; 1693 } 1694 1695 g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int)); 1696 if (g->matchjump == NULL) /* Not a fatal error */ 1697 return; 1698 1699 /* Set maximum possible jump for each character in the pattern */ 1700 for (mindex = 0; mindex < g->mlen; mindex++) 1701 g->matchjump[mindex] = 2*g->mlen - mindex - 1; 1702 1703 /* Compute pmatches[] */ 1704 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 1705 mindex--, suffix--) { 1706 pmatches[mindex] = suffix; 1707 1708 /* If a mismatch is found, interrupting the substring, 1709 * compute the matchjump for that position. If no 1710 * mismatch is found, then a text substring mismatched 1711 * against the suffix will also mismatch against the 1712 * substring. 1713 */ 1714 while (suffix < g->mlen 1715 && g->must[mindex] != g->must[suffix]) { 1716 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1717 g->mlen - mindex - 1); 1718 suffix = pmatches[suffix]; 1719 } 1720 } 1721 1722 /* Compute the matchjump up to the last substring found to jump 1723 * to the beginning of the largest must pattern prefix matching 1724 * it's own suffix. 1725 */ 1726 for (mindex = 0; mindex <= suffix; mindex++) 1727 g->matchjump[mindex] = MIN(g->matchjump[mindex], 1728 g->mlen + suffix - mindex); 1729 1730 ssuffix = pmatches[suffix]; 1731 while (suffix < g->mlen) { 1732 while (suffix <= ssuffix && suffix < g->mlen) { 1733 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1734 g->mlen + ssuffix - suffix); 1735 suffix++; 1736 } 1737 if (suffix < g->mlen) 1738 ssuffix = pmatches[ssuffix]; 1739 } 1740 1741 free(pmatches); 1742 } 1743 1744 /* 1745 - pluscount - count + nesting 1746 == static sopno pluscount(struct parse *p, struct re_guts *g); 1747 */ 1748 static sopno /* nesting depth */ 1749 pluscount(struct parse *p, struct re_guts *g) 1750 { 1751 sop *scan; 1752 sop s; 1753 sopno plusnest = 0; 1754 sopno maxnest = 0; 1755 1756 if (p->error != 0) 1757 return(0); /* there may not be an OEND */ 1758 1759 scan = g->strip + 1; 1760 do { 1761 s = *scan++; 1762 switch (OP(s)) { 1763 case OPLUS_: 1764 plusnest++; 1765 break; 1766 case O_PLUS: 1767 if (plusnest > maxnest) 1768 maxnest = plusnest; 1769 plusnest--; 1770 break; 1771 } 1772 } while (OP(s) != OEND); 1773 if (plusnest != 0) 1774 g->iflags |= BAD; 1775 return(maxnest); 1776 } 1777