1 /* 2 * Copyright 2013 Garrett D'Amore <garrett@damore.org> 3 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 4 * Copyright 2012 Milan Jurik. All rights reserved. 5 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 6 * Copyright (c) 1992, 1993, 1994 7 * The Regents of the University of California. All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * Henry Spencer. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 */ 36 37 #include "lint.h" 38 #include "file64.h" 39 #include <sys/types.h> 40 #include <stdio.h> 41 #include <string.h> 42 #include <ctype.h> 43 #include <limits.h> 44 #include <stdlib.h> 45 #include <regex.h> 46 #include <wchar.h> 47 #include <wctype.h> 48 49 #include "../locale/runetype.h" 50 #include "../locale/collate.h" 51 52 #include "utils.h" 53 #include "regex2.h" 54 55 #include "cname.h" 56 #include "../locale/mblocal.h" 57 58 /* 59 * parse structure, passed up and down to avoid global variables and 60 * other clumsinesses 61 */ 62 struct parse { 63 const char *next; /* next character in RE */ 64 const char *end; /* end of string (-> NUL normally) */ 65 int error; /* has an error been seen? */ 66 sop *strip; /* malloced strip */ 67 sopno ssize; /* malloced strip size (allocated) */ 68 sopno slen; /* malloced strip length (used) */ 69 int ncsalloc; /* number of csets allocated */ 70 struct re_guts *g; 71 #define NPAREN 10 /* we need to remember () 1-9 for back refs */ 72 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 73 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 74 }; 75 76 /* ========= begin header generated by ./mkh ========= */ 77 #ifdef __cplusplus 78 extern "C" { 79 #endif 80 81 /* === regcomp.c === */ 82 static void p_ere(struct parse *p, int stop); 83 static void p_ere_exp(struct parse *p); 84 static void p_str(struct parse *p); 85 static void p_bre(struct parse *p, int end1, int end2); 86 static int p_simp_re(struct parse *p, int starordinary); 87 static int p_count(struct parse *p); 88 static void p_bracket(struct parse *p); 89 static void p_b_term(struct parse *p, cset *cs); 90 static void p_b_cclass(struct parse *p, cset *cs); 91 static void p_b_eclass(struct parse *p, cset *cs); 92 static wint_t p_b_symbol(struct parse *p); 93 static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 94 static wint_t othercase(wint_t ch); 95 static void bothcases(struct parse *p, wint_t ch); 96 static void ordinary(struct parse *p, wint_t ch); 97 static void nonnewline(struct parse *p); 98 static void repeat(struct parse *p, sopno start, int from, int to); 99 static int seterr(struct parse *p, int e); 100 static cset *allocset(struct parse *p); 101 static void freeset(struct parse *p, cset *cs); 102 static void CHadd(struct parse *p, cset *cs, wint_t ch); 103 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 104 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 105 static wint_t singleton(cset *cs); 106 static sopno dupl(struct parse *p, sopno start, sopno finish); 107 static void doemit(struct parse *p, sop op, size_t opnd); 108 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 109 static void dofwd(struct parse *p, sopno pos, sop value); 110 static int enlarge(struct parse *p, sopno size); 111 static void stripsnug(struct parse *p, struct re_guts *g); 112 static void findmust(struct parse *p, struct re_guts *g); 113 static int altoffset(sop *scan, int offset); 114 static void computejumps(struct parse *p, struct re_guts *g); 115 static void computematchjumps(struct parse *p, struct re_guts *g); 116 static sopno pluscount(struct parse *p, struct re_guts *g); 117 static wint_t wgetnext(struct parse *p); 118 119 #ifdef __cplusplus 120 } 121 #endif 122 /* ========= end header generated by ./mkh ========= */ 123 124 static char nuls[10]; /* place to point scanner in event of error */ 125 126 /* 127 * macros for use with parse structure 128 * BEWARE: these know that the parse structure is named `p' !!! 129 */ 130 #define PEEK() (*p->next) 131 #define PEEK2() (*(p->next+1)) 132 #define MORE() (p->next < p->end) 133 #define MORE2() (p->next+1 < p->end) 134 #define SEE(c) (MORE() && PEEK() == (c)) 135 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 136 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 137 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 138 #define NEXT() (p->next++) 139 #define NEXT2() (p->next += 2) 140 #define NEXTn(n) (p->next += (n)) 141 #define GETNEXT() (*p->next++) 142 #define WGETNEXT() wgetnext(p) 143 #define SETERROR(e) ((void)seterr(p, (e))) 144 #define REQUIRE(co, e) ((co) || seterr(p, e)) 145 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 146 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 147 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 148 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 149 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 150 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 151 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 152 #define HERE() (p->slen) 153 #define THERE() (p->slen - 1) 154 #define THERETHERE() (p->slen - 2) 155 #define DROP(n) (p->slen -= (n)) 156 157 #ifndef NDEBUG 158 static int never = 0; /* for use in asserts; shuts lint up */ 159 #else 160 #define never 0 /* some <assert.h>s have bugs too */ 161 #endif 162 163 /* 164 * regcomp - interface for parser and compilation 165 */ 166 int /* 0 success, otherwise REG_something */ 167 regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern, 168 int cflags) 169 { 170 struct parse pa; 171 struct re_guts *g; 172 struct parse *p = &pa; 173 int i; 174 size_t len; 175 size_t maxlen; 176 #ifdef REDEBUG 177 #define GOODFLAGS(f) (f) 178 #else 179 #define GOODFLAGS(f) ((f)&~REG_DUMP) 180 #endif 181 182 /* We had REG_INVARG, but we don't have that on Solaris. */ 183 cflags = GOODFLAGS(cflags); 184 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 185 return (REG_EFATAL); 186 187 if (cflags®_PEND) { 188 if (preg->re_endp < pattern) 189 return (REG_EFATAL); 190 len = preg->re_endp - pattern; 191 } else 192 len = strlen(pattern); 193 194 /* do the mallocs early so failure handling is easy */ 195 g = (struct re_guts *)malloc(sizeof (struct re_guts)); 196 if (g == NULL) 197 return (REG_ESPACE); 198 /* 199 * Limit the pattern space to avoid a 32-bit overflow on buffer 200 * extension. Also avoid any signed overflow in case of conversion 201 * so make the real limit based on a 31-bit overflow. 202 * 203 * Likely not applicable on 64-bit systems but handle the case 204 * generically (who are we to stop people from using ~715MB+ 205 * patterns?). 206 */ 207 maxlen = ((size_t)-1 >> 1) / sizeof (sop) * 2 / 3; 208 if (len >= maxlen) { 209 free((char *)g); 210 return (REG_ESPACE); 211 } 212 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 213 assert(p->ssize >= len); 214 215 p->strip = (sop *)malloc(p->ssize * sizeof (sop)); 216 p->slen = 0; 217 if (p->strip == NULL) { 218 free((char *)g); 219 return (REG_ESPACE); 220 } 221 222 /* set things up */ 223 p->g = g; 224 p->next = pattern; /* convenience; we do not modify it */ 225 p->end = p->next + len; 226 p->error = 0; 227 p->ncsalloc = 0; 228 for (i = 0; i < NPAREN; i++) { 229 p->pbegin[i] = 0; 230 p->pend[i] = 0; 231 } 232 g->sets = NULL; 233 g->ncsets = 0; 234 g->cflags = cflags; 235 g->iflags = 0; 236 g->nbol = 0; 237 g->neol = 0; 238 g->must = NULL; 239 g->moffset = -1; 240 g->charjump = NULL; 241 g->matchjump = NULL; 242 g->mlen = 0; 243 g->nsub = 0; 244 g->backrefs = 0; 245 246 /* do it */ 247 EMIT(OEND, 0); 248 g->firststate = THERE(); 249 if (cflags®_EXTENDED) 250 p_ere(p, OUT); 251 else if (cflags®_NOSPEC) 252 p_str(p); 253 else 254 p_bre(p, OUT, OUT); 255 EMIT(OEND, 0); 256 g->laststate = THERE(); 257 258 /* tidy up loose ends and fill things in */ 259 stripsnug(p, g); 260 findmust(p, g); 261 /* 262 * only use Boyer-Moore algorithm if the pattern is bigger 263 * than three characters 264 */ 265 if (g->mlen > 3) { 266 computejumps(p, g); 267 computematchjumps(p, g); 268 if (g->matchjump == NULL && g->charjump != NULL) { 269 free(g->charjump); 270 g->charjump = NULL; 271 } 272 } 273 g->nplus = pluscount(p, g); 274 g->magic = MAGIC2; 275 preg->re_nsub = g->nsub; 276 preg->re_g = g; 277 preg->re_magic = MAGIC1; 278 #ifndef REDEBUG 279 /* not debugging, so can't rely on the assert() in regexec() */ 280 if (g->iflags&BAD) 281 SETERROR(REG_EFATAL); 282 #endif 283 284 /* win or lose, we're done */ 285 if (p->error != 0) /* lose */ 286 regfree(preg); 287 return (p->error); 288 } 289 290 /* 291 * p_ere - ERE parser top level, concatenation and alternation 292 */ 293 static void 294 p_ere(struct parse *p, 295 int stop) /* character this ERE should end at */ 296 { 297 char c; 298 sopno prevback; 299 sopno prevfwd; 300 sopno conc; 301 int first = 1; /* is this the first alternative? */ 302 303 for (;;) { 304 /* do a bunch of concatenated expressions */ 305 conc = HERE(); 306 while (MORE() && (c = PEEK()) != '|' && c != stop) 307 p_ere_exp(p); 308 /* require nonempty */ 309 (void) REQUIRE(HERE() != conc, REG_BADPAT); 310 311 if (!EAT('|')) 312 break; /* NOTE BREAK OUT */ 313 314 if (first) { 315 INSERT(OCH_, conc); /* offset is wrong */ 316 prevfwd = conc; 317 prevback = conc; 318 first = 0; 319 } 320 ASTERN(OOR1, prevback); 321 prevback = THERE(); 322 AHEAD(prevfwd); /* fix previous offset */ 323 prevfwd = HERE(); 324 EMIT(OOR2, 0); /* offset is very wrong */ 325 } 326 327 if (!first) { /* tail-end fixups */ 328 AHEAD(prevfwd); 329 ASTERN(O_CH, prevback); 330 } 331 332 assert(!MORE() || SEE(stop)); 333 } 334 335 /* 336 * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 337 */ 338 static void 339 p_ere_exp(struct parse *p) 340 { 341 char c; 342 wint_t wc; 343 sopno pos; 344 int count; 345 int count2; 346 sopno subno; 347 int wascaret = 0; 348 349 assert(MORE()); /* caller should have ensured this */ 350 c = GETNEXT(); 351 352 pos = HERE(); 353 switch (c) { 354 case '(': 355 (void) REQUIRE(MORE(), REG_EPAREN); 356 p->g->nsub++; 357 subno = p->g->nsub; 358 if (subno < NPAREN) 359 p->pbegin[subno] = HERE(); 360 EMIT(OLPAREN, subno); 361 if (!SEE(')')) 362 p_ere(p, ')'); 363 if (subno < NPAREN) { 364 p->pend[subno] = HERE(); 365 assert(p->pend[subno] != 0); 366 } 367 EMIT(ORPAREN, subno); 368 (void) MUSTEAT(')', REG_EPAREN); 369 break; 370 #ifndef POSIX_MISTAKE 371 case ')': /* happens only if no current unmatched ( */ 372 /* 373 * You may ask, why the ifndef? Because I didn't notice 374 * this until slightly too late for 1003.2, and none of the 375 * other 1003.2 regular-expression reviewers noticed it at 376 * all. So an unmatched ) is legal POSIX, at least until 377 * we can get it fixed. 378 */ 379 SETERROR(REG_EPAREN); 380 break; 381 #endif 382 case '^': 383 EMIT(OBOL, 0); 384 p->g->iflags |= USEBOL; 385 p->g->nbol++; 386 wascaret = 1; 387 break; 388 case '$': 389 EMIT(OEOL, 0); 390 p->g->iflags |= USEEOL; 391 p->g->neol++; 392 break; 393 case '|': 394 SETERROR(REG_BADPAT); 395 break; 396 case '*': 397 case '+': 398 case '?': 399 SETERROR(REG_BADRPT); 400 break; 401 case '.': 402 if (p->g->cflags®_NEWLINE) 403 nonnewline(p); 404 else 405 EMIT(OANY, 0); 406 break; 407 case '[': 408 p_bracket(p); 409 break; 410 case '\\': 411 (void) REQUIRE(MORE(), REG_EESCAPE); 412 wc = WGETNEXT(); 413 switch (wc) { 414 case '<': 415 EMIT(OBOW, 0); 416 break; 417 case '>': 418 EMIT(OEOW, 0); 419 break; 420 default: 421 ordinary(p, wc); 422 break; 423 } 424 break; 425 case '{': /* okay as ordinary except if digit follows */ 426 (void) REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 427 /* FALLTHROUGH */ 428 default: 429 if (p->error != 0) 430 return; 431 p->next--; 432 wc = WGETNEXT(); 433 ordinary(p, wc); 434 break; 435 } 436 437 if (!MORE()) 438 return; 439 c = PEEK(); 440 /* we call { a repetition if followed by a digit */ 441 if (!(c == '*' || c == '+' || c == '?' || 442 (c == '{' && MORE2() && isdigit((uch)PEEK2())))) 443 return; /* no repetition, we're done */ 444 NEXT(); 445 446 (void) REQUIRE(!wascaret, REG_BADRPT); 447 switch (c) { 448 case '*': /* implemented as +? */ 449 /* this case does not require the (y|) trick, noKLUDGE */ 450 INSERT(OPLUS_, pos); 451 ASTERN(O_PLUS, pos); 452 INSERT(OQUEST_, pos); 453 ASTERN(O_QUEST, pos); 454 break; 455 case '+': 456 INSERT(OPLUS_, pos); 457 ASTERN(O_PLUS, pos); 458 break; 459 case '?': 460 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 461 INSERT(OCH_, pos); /* offset slightly wrong */ 462 ASTERN(OOR1, pos); /* this one's right */ 463 AHEAD(pos); /* fix the OCH_ */ 464 EMIT(OOR2, 0); /* offset very wrong... */ 465 AHEAD(THERE()); /* ...so fix it */ 466 ASTERN(O_CH, THERETHERE()); 467 break; 468 case '{': 469 count = p_count(p); 470 if (EAT(',')) { 471 if (isdigit((uch)PEEK())) { 472 count2 = p_count(p); 473 (void) REQUIRE(count <= count2, REG_BADBR); 474 } else /* single number with comma */ 475 count2 = INFINITY; 476 } else /* just a single number */ 477 count2 = count; 478 repeat(p, pos, count, count2); 479 if (!EAT('}')) { /* error heuristics */ 480 while (MORE() && PEEK() != '}') 481 NEXT(); 482 (void) REQUIRE(MORE(), REG_EBRACE); 483 SETERROR(REG_BADBR); 484 } 485 break; 486 } 487 488 if (!MORE()) 489 return; 490 c = PEEK(); 491 if (!(c == '*' || c == '+' || c == '?' || 492 (c == '{' && MORE2() && isdigit((uch)PEEK2())))) 493 return; 494 SETERROR(REG_BADRPT); 495 } 496 497 /* 498 * p_str - string (no metacharacters) "parser" 499 */ 500 static void 501 p_str(struct parse *p) 502 { 503 (void) REQUIRE(MORE(), REG_BADPAT); 504 while (MORE()) 505 ordinary(p, WGETNEXT()); 506 } 507 508 /* 509 * p_bre - BRE parser top level, anchoring and concatenation 510 * Giving end1 as OUT essentially eliminates the end1/end2 check. 511 * 512 * This implementation is a bit of a kludge, in that a trailing $ is first 513 * taken as an ordinary character and then revised to be an anchor. 514 * The amount of lookahead needed to avoid this kludge is excessive. 515 */ 516 static void 517 p_bre(struct parse *p, 518 int end1, /* first terminating character */ 519 int end2) /* second terminating character */ 520 { 521 sopno start = HERE(); 522 int first = 1; /* first subexpression? */ 523 int wasdollar = 0; 524 525 if (EAT('^')) { 526 EMIT(OBOL, 0); 527 p->g->iflags |= USEBOL; 528 p->g->nbol++; 529 } 530 while (MORE() && !SEETWO(end1, end2)) { 531 wasdollar = p_simp_re(p, first); 532 first = 0; 533 } 534 if (wasdollar) { /* oops, that was a trailing anchor */ 535 DROP(1); 536 EMIT(OEOL, 0); 537 p->g->iflags |= USEEOL; 538 p->g->neol++; 539 } 540 541 (void) REQUIRE(HERE() != start, REG_BADPAT); /* require nonempty */ 542 } 543 544 /* 545 * p_simp_re - parse a simple RE, an atom possibly followed by a repetition 546 */ 547 static int /* was the simple RE an unbackslashed $? */ 548 p_simp_re(struct parse *p, 549 int starordinary) /* is a leading * an ordinary character? */ 550 { 551 int c; 552 int count; 553 int count2; 554 sopno pos; 555 int i; 556 wint_t wc; 557 sopno subno; 558 #define BACKSL (1<<CHAR_BIT) 559 560 pos = HERE(); /* repetition op, if any, covers from here */ 561 562 assert(MORE()); /* caller should have ensured this */ 563 c = GETNEXT(); 564 if (c == '\\') { 565 (void) REQUIRE(MORE(), REG_EESCAPE); 566 c = BACKSL | GETNEXT(); 567 } 568 switch (c) { 569 case '.': 570 if (p->g->cflags®_NEWLINE) 571 nonnewline(p); 572 else 573 EMIT(OANY, 0); 574 break; 575 case '[': 576 p_bracket(p); 577 break; 578 case BACKSL|'<': 579 EMIT(OBOW, 0); 580 break; 581 case BACKSL|'>': 582 EMIT(OEOW, 0); 583 break; 584 case BACKSL|'{': 585 SETERROR(REG_BADRPT); 586 break; 587 case BACKSL|'(': 588 p->g->nsub++; 589 subno = p->g->nsub; 590 if (subno < NPAREN) 591 p->pbegin[subno] = HERE(); 592 EMIT(OLPAREN, subno); 593 /* the MORE here is an error heuristic */ 594 if (MORE() && !SEETWO('\\', ')')) 595 p_bre(p, '\\', ')'); 596 if (subno < NPAREN) { 597 p->pend[subno] = HERE(); 598 assert(p->pend[subno] != 0); 599 } 600 EMIT(ORPAREN, subno); 601 (void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 602 break; 603 case BACKSL|')': /* should not get here -- must be user */ 604 case BACKSL|'}': 605 SETERROR(REG_EPAREN); 606 break; 607 case BACKSL|'1': 608 case BACKSL|'2': 609 case BACKSL|'3': 610 case BACKSL|'4': 611 case BACKSL|'5': 612 case BACKSL|'6': 613 case BACKSL|'7': 614 case BACKSL|'8': 615 case BACKSL|'9': 616 i = (c&~BACKSL) - '0'; 617 assert(i < NPAREN); 618 if (p->pend[i] != 0) { 619 assert(i <= p->g->nsub); 620 EMIT(OBACK_, i); 621 assert(p->pbegin[i] != 0); 622 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 623 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 624 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 625 EMIT(O_BACK, i); 626 } else 627 SETERROR(REG_ESUBREG); 628 p->g->backrefs = 1; 629 break; 630 case '*': 631 (void) REQUIRE(starordinary, REG_BADRPT); 632 /* FALLTHROUGH */ 633 default: 634 if (p->error != 0) 635 return (0); /* Definitely not $... */ 636 p->next--; 637 wc = WGETNEXT(); 638 ordinary(p, wc); 639 break; 640 } 641 642 if (EAT('*')) { /* implemented as +? */ 643 /* this case does not require the (y|) trick, noKLUDGE */ 644 INSERT(OPLUS_, pos); 645 ASTERN(O_PLUS, pos); 646 INSERT(OQUEST_, pos); 647 ASTERN(O_QUEST, pos); 648 } else if (EATTWO('\\', '{')) { 649 count = p_count(p); 650 if (EAT(',')) { 651 if (MORE() && isdigit((uch)PEEK())) { 652 count2 = p_count(p); 653 (void) REQUIRE(count <= count2, REG_BADBR); 654 } else /* single number with comma */ 655 count2 = INFINITY; 656 } else /* just a single number */ 657 count2 = count; 658 repeat(p, pos, count, count2); 659 if (!EATTWO('\\', '}')) { /* error heuristics */ 660 while (MORE() && !SEETWO('\\', '}')) 661 NEXT(); 662 (void) REQUIRE(MORE(), REG_EBRACE); 663 SETERROR(REG_BADBR); 664 } 665 } else if (c == '$') /* $ (but not \$) ends it */ 666 return (1); 667 668 return (0); 669 } 670 671 /* 672 * p_count - parse a repetition count 673 */ 674 static int /* the value */ 675 p_count(struct parse *p) 676 { 677 int count = 0; 678 int ndigits = 0; 679 680 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 681 count = count*10 + (GETNEXT() - '0'); 682 ndigits++; 683 } 684 685 (void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 686 return (count); 687 } 688 689 /* 690 * p_bracket - parse a bracketed character list 691 */ 692 static void 693 p_bracket(struct parse *p) 694 { 695 cset *cs; 696 wint_t ch; 697 698 /* Dept of Truly Sickening Special-Case Kludges */ 699 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 700 EMIT(OBOW, 0); 701 NEXTn(6); 702 return; 703 } 704 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 705 EMIT(OEOW, 0); 706 NEXTn(6); 707 return; 708 } 709 710 if ((cs = allocset(p)) == NULL) 711 return; 712 713 if (p->g->cflags®_ICASE) 714 cs->icase = 1; 715 if (EAT('^')) 716 cs->invert = 1; 717 if (EAT(']')) 718 CHadd(p, cs, ']'); 719 else if (EAT('-')) 720 CHadd(p, cs, '-'); 721 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 722 p_b_term(p, cs); 723 if (EAT('-')) 724 CHadd(p, cs, '-'); 725 (void) MUSTEAT(']', REG_EBRACK); 726 727 if (p->error != 0) /* don't mess things up further */ 728 return; 729 730 if (cs->invert && p->g->cflags®_NEWLINE) 731 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 732 733 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 734 ordinary(p, ch); 735 freeset(p, cs); 736 } else 737 EMIT(OANYOF, (int)(cs - p->g->sets)); 738 } 739 740 /* 741 * p_b_term - parse one term of a bracketed character list 742 */ 743 static void 744 p_b_term(struct parse *p, cset *cs) 745 { 746 char c; 747 wint_t start, finish; 748 wint_t i; 749 locale_t loc = uselocale(NULL); 750 751 /* classify what we've got */ 752 switch ((MORE()) ? PEEK() : '\0') { 753 case '[': 754 c = (MORE2()) ? PEEK2() : '\0'; 755 break; 756 case '-': 757 SETERROR(REG_ERANGE); 758 return; /* NOTE RETURN */ 759 default: 760 c = '\0'; 761 break; 762 } 763 764 switch (c) { 765 case ':': /* character class */ 766 NEXT2(); 767 (void) REQUIRE(MORE(), REG_EBRACK); 768 c = PEEK(); 769 (void) REQUIRE(c != '-' && c != ']', REG_ECTYPE); 770 p_b_cclass(p, cs); 771 (void) REQUIRE(MORE(), REG_EBRACK); 772 (void) REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 773 break; 774 case '=': /* equivalence class */ 775 NEXT2(); 776 (void) REQUIRE(MORE(), REG_EBRACK); 777 c = PEEK(); 778 (void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 779 p_b_eclass(p, cs); 780 (void) REQUIRE(MORE(), REG_EBRACK); 781 (void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 782 break; 783 default: /* symbol, ordinary character, or range */ 784 start = p_b_symbol(p); 785 if (SEE('-') && MORE2() && PEEK2() != ']') { 786 /* range */ 787 NEXT(); 788 if (EAT('-')) 789 finish = '-'; 790 else 791 finish = p_b_symbol(p); 792 } else 793 finish = start; 794 if (start == finish) 795 CHadd(p, cs, start); 796 else { 797 if (loc->collate->lc_is_posix) { 798 (void) REQUIRE((uch)start <= (uch)finish, 799 REG_ERANGE); 800 CHaddrange(p, cs, start, finish); 801 } else { 802 (void) REQUIRE(_collate_range_cmp(start, 803 finish, loc) <= 0, REG_ERANGE); 804 for (i = 0; i <= UCHAR_MAX; i++) { 805 if (_collate_range_cmp(start, i, loc) 806 <= 0 && 807 _collate_range_cmp(i, finish, loc) 808 <= 0) 809 CHadd(p, cs, i); 810 } 811 } 812 } 813 break; 814 } 815 } 816 817 /* 818 * p_b_cclass - parse a character-class name and deal with it 819 */ 820 static void 821 p_b_cclass(struct parse *p, cset *cs) 822 { 823 const char *sp = p->next; 824 size_t len; 825 wctype_t wct; 826 char clname[16]; 827 828 while (MORE() && isalpha((uch)PEEK())) 829 NEXT(); 830 len = p->next - sp; 831 if (len >= sizeof (clname) - 1) { 832 SETERROR(REG_ECTYPE); 833 return; 834 } 835 (void) memcpy(clname, sp, len); 836 clname[len] = '\0'; 837 if ((wct = wctype(clname)) == 0) { 838 SETERROR(REG_ECTYPE); 839 return; 840 } 841 CHaddtype(p, cs, wct); 842 } 843 844 /* 845 * p_b_eclass - parse an equivalence-class name and deal with it 846 * 847 * This implementation is incomplete. xxx 848 */ 849 static void 850 p_b_eclass(struct parse *p, cset *cs) 851 { 852 wint_t c; 853 854 c = p_b_coll_elem(p, '='); 855 CHadd(p, cs, c); 856 } 857 858 /* 859 * p_b_symbol - parse a character or [..]ed multicharacter collating symbol 860 */ 861 static wint_t /* value of symbol */ 862 p_b_symbol(struct parse *p) 863 { 864 wint_t value; 865 866 (void) REQUIRE(MORE(), REG_EBRACK); 867 if (!EATTWO('[', '.')) 868 return (WGETNEXT()); 869 870 /* collating symbol */ 871 value = p_b_coll_elem(p, '.'); 872 (void) REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 873 return (value); 874 } 875 876 /* 877 * p_b_coll_elem - parse a collating-element name and look it up 878 */ 879 static wint_t /* value of collating element */ 880 p_b_coll_elem(struct parse *p, 881 wint_t endc) /* name ended by endc,']' */ 882 { 883 const char *sp = p->next; 884 struct cname *cp; 885 mbstate_t mbs; 886 wchar_t wc; 887 size_t clen, len; 888 889 while (MORE() && !SEETWO(endc, ']')) 890 NEXT(); 891 if (!MORE()) { 892 SETERROR(REG_EBRACK); 893 return (0); 894 } 895 len = p->next - sp; 896 for (cp = cnames; cp->name != NULL; cp++) 897 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 898 return (cp->code); /* known name */ 899 (void) memset(&mbs, 0, sizeof (mbs)); 900 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 901 return (wc); /* single character */ 902 else if (clen == (size_t)-1 || clen == (size_t)-2) 903 SETERROR(REG_ECHAR); 904 else 905 SETERROR(REG_ECOLLATE); /* neither */ 906 return (0); 907 } 908 909 /* 910 * othercase - return the case counterpart of an alphabetic 911 */ 912 static wint_t /* if no counterpart, return ch */ 913 othercase(wint_t ch) 914 { 915 assert(iswalpha(ch)); 916 if (iswupper(ch)) 917 return (towlower(ch)); 918 else if (iswlower(ch)) 919 return (towupper(ch)); 920 else /* peculiar, but could happen */ 921 return (ch); 922 } 923 924 /* 925 * bothcases - emit a dualcase version of a two-case character 926 * 927 * Boy, is this implementation ever a kludge... 928 */ 929 static void 930 bothcases(struct parse *p, wint_t ch) 931 { 932 const char *oldnext = p->next; 933 const char *oldend = p->end; 934 char bracket[3 + MB_LEN_MAX]; 935 size_t n; 936 mbstate_t mbs; 937 938 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 939 p->next = bracket; 940 (void) memset(&mbs, 0, sizeof (mbs)); 941 n = wcrtomb(bracket, ch, &mbs); 942 assert(n != (size_t)-1); 943 bracket[n] = ']'; 944 bracket[n + 1] = '\0'; 945 p->end = bracket+n+1; 946 p_bracket(p); 947 assert(p->next == p->end); 948 p->next = oldnext; 949 p->end = oldend; 950 } 951 952 /* 953 * ordinary - emit an ordinary character 954 */ 955 static void 956 ordinary(struct parse *p, wint_t ch) 957 { 958 cset *cs; 959 960 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 961 bothcases(p, ch); 962 else if ((ch & OPDMASK) == ch) 963 EMIT(OCHAR, ch); 964 else { 965 /* 966 * Kludge: character is too big to fit into an OCHAR operand. 967 * Emit a singleton set. 968 */ 969 if ((cs = allocset(p)) == NULL) 970 return; 971 CHadd(p, cs, ch); 972 EMIT(OANYOF, (int)(cs - p->g->sets)); 973 } 974 } 975 976 /* 977 * nonnewline - emit REG_NEWLINE version of OANY 978 * 979 * Boy, is this implementation ever a kludge... 980 */ 981 static void 982 nonnewline(struct parse *p) 983 { 984 const char *oldnext = p->next; 985 const char *oldend = p->end; 986 char bracket[4]; 987 988 p->next = bracket; 989 p->end = bracket+3; 990 bracket[0] = '^'; 991 bracket[1] = '\n'; 992 bracket[2] = ']'; 993 bracket[3] = '\0'; 994 p_bracket(p); 995 assert(p->next == bracket+3); 996 p->next = oldnext; 997 p->end = oldend; 998 } 999 1000 /* 1001 * repeat - generate code for a bounded repetition, recursively if needed 1002 */ 1003 static void 1004 repeat(struct parse *p, 1005 sopno start, /* operand from here to end of strip */ 1006 int from, /* repeated from this number */ 1007 int to) /* to this number of times (maybe INFINITY) */ 1008 { 1009 sopno finish = HERE(); 1010 #define N 2 1011 #define INF 3 1012 #define REP(f, t) ((f)*8 + (t)) 1013 #define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1014 sopno copy; 1015 1016 if (p->error != 0) /* head off possible runaway recursion */ 1017 return; 1018 1019 assert(from <= to); 1020 1021 switch (REP(MAP(from), MAP(to))) { 1022 case REP(0, 0): /* must be user doing this */ 1023 DROP(finish-start); /* drop the operand */ 1024 break; 1025 case REP(0, 1): /* as x{1,1}? */ 1026 case REP(0, N): /* as x{1,n}? */ 1027 case REP(0, INF): /* as x{1,}? */ 1028 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1029 INSERT(OCH_, start); /* offset is wrong... */ 1030 repeat(p, start+1, 1, to); 1031 ASTERN(OOR1, start); 1032 AHEAD(start); /* ... fix it */ 1033 EMIT(OOR2, 0); 1034 AHEAD(THERE()); 1035 ASTERN(O_CH, THERETHERE()); 1036 break; 1037 case REP(1, 1): /* trivial case */ 1038 /* done */ 1039 break; 1040 case REP(1, N): /* as x?x{1,n-1} */ 1041 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1042 INSERT(OCH_, start); 1043 ASTERN(OOR1, start); 1044 AHEAD(start); 1045 EMIT(OOR2, 0); /* offset very wrong... */ 1046 AHEAD(THERE()); /* ...so fix it */ 1047 ASTERN(O_CH, THERETHERE()); 1048 copy = dupl(p, start+1, finish+1); 1049 assert(copy == finish+4); 1050 repeat(p, copy, 1, to-1); 1051 break; 1052 case REP(1, INF): /* as x+ */ 1053 INSERT(OPLUS_, start); 1054 ASTERN(O_PLUS, start); 1055 break; 1056 case REP(N, N): /* as xx{m-1,n-1} */ 1057 copy = dupl(p, start, finish); 1058 repeat(p, copy, from-1, to-1); 1059 break; 1060 case REP(N, INF): /* as xx{n-1,INF} */ 1061 copy = dupl(p, start, finish); 1062 repeat(p, copy, from-1, to); 1063 break; 1064 default: /* "can't happen" */ 1065 SETERROR(REG_EFATAL); /* just in case */ 1066 break; 1067 } 1068 } 1069 1070 /* 1071 * wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1072 * character from the parse struct, signals a REG_ILLSEQ error if the 1073 * character can't be converted. Returns the number of bytes consumed. 1074 */ 1075 static wint_t 1076 wgetnext(struct parse *p) 1077 { 1078 mbstate_t mbs; 1079 wchar_t wc; 1080 size_t n; 1081 1082 (void) memset(&mbs, 0, sizeof (mbs)); 1083 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); 1084 if (n == (size_t)-1 || n == (size_t)-2) { 1085 SETERROR(REG_ECHAR); 1086 return (0); 1087 } 1088 if (n == 0) 1089 n = 1; 1090 p->next += n; 1091 return (wc); 1092 } 1093 1094 /* 1095 * seterr - set an error condition 1096 */ 1097 static int /* useless but makes type checking happy */ 1098 seterr(struct parse *p, int e) 1099 { 1100 if (p->error == 0) /* keep earliest error condition */ 1101 p->error = e; 1102 p->next = nuls; /* try to bring things to a halt */ 1103 p->end = nuls; 1104 return (0); /* make the return value well-defined */ 1105 } 1106 1107 /* 1108 * allocset - allocate a set of characters for [] 1109 */ 1110 static cset * 1111 allocset(struct parse *p) 1112 { 1113 cset *cs, *ncs; 1114 1115 ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof (*ncs)); 1116 if (ncs == NULL) { 1117 SETERROR(REG_ESPACE); 1118 return (NULL); 1119 } 1120 p->g->sets = ncs; 1121 cs = &p->g->sets[p->g->ncsets++]; 1122 (void) memset(cs, 0, sizeof (*cs)); 1123 1124 return (cs); 1125 } 1126 1127 /* 1128 * freeset - free a now-unused set 1129 */ 1130 static void 1131 freeset(struct parse *p, cset *cs) 1132 { 1133 cset *top = &p->g->sets[p->g->ncsets]; 1134 1135 free(cs->wides); 1136 free(cs->ranges); 1137 free(cs->types); 1138 (void) memset(cs, 0, sizeof (*cs)); 1139 if (cs == top-1) /* recover only the easy case */ 1140 p->g->ncsets--; 1141 } 1142 1143 /* 1144 * singleton - Determine whether a set contains only one character, 1145 * returning it if so, otherwise returning OUT. 1146 */ 1147 static wint_t 1148 singleton(cset *cs) 1149 { 1150 wint_t i, s, n; 1151 1152 for (i = n = 0; i < NC; i++) 1153 if (CHIN(cs, i)) { 1154 n++; 1155 s = i; 1156 } 1157 if (n == 1) 1158 return (s); 1159 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1160 cs->icase == 0) 1161 return (cs->wides[0]); 1162 /* Don't bother handling the other cases. */ 1163 return (OUT); 1164 } 1165 1166 /* 1167 * CHadd - add character to character set. 1168 */ 1169 static void 1170 CHadd(struct parse *p, cset *cs, wint_t ch) 1171 { 1172 wint_t nch, *newwides; 1173 assert(ch >= 0); 1174 if (ch < NC) 1175 cs->bmp[ch >> 3] |= 1 << (ch & 7); 1176 else { 1177 newwides = realloc(cs->wides, (cs->nwides + 1) * 1178 sizeof (*cs->wides)); 1179 if (newwides == NULL) { 1180 SETERROR(REG_ESPACE); 1181 return; 1182 } 1183 cs->wides = newwides; 1184 cs->wides[cs->nwides++] = ch; 1185 } 1186 if (cs->icase) { 1187 if ((nch = towlower(ch)) < NC) 1188 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1189 if ((nch = towupper(ch)) < NC) 1190 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1191 } 1192 } 1193 1194 /* 1195 * CHaddrange - add all characters in the range [min,max] to a character set. 1196 */ 1197 static void 1198 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 1199 { 1200 crange *newranges; 1201 1202 for (; min < NC && min <= max; min++) 1203 CHadd(p, cs, min); 1204 if (min >= max) 1205 return; 1206 newranges = realloc(cs->ranges, (cs->nranges + 1) * 1207 sizeof (*cs->ranges)); 1208 if (newranges == NULL) { 1209 SETERROR(REG_ESPACE); 1210 return; 1211 } 1212 cs->ranges = newranges; 1213 cs->ranges[cs->nranges].min = min; 1214 cs->ranges[cs->nranges].max = max; 1215 cs->nranges++; 1216 } 1217 1218 /* 1219 * CHaddtype - add all characters of a certain type to a character set. 1220 */ 1221 static void 1222 CHaddtype(struct parse *p, cset *cs, wctype_t wct) 1223 { 1224 wint_t i; 1225 wctype_t *newtypes; 1226 1227 for (i = 0; i < NC; i++) 1228 if (iswctype(i, wct)) 1229 CHadd(p, cs, i); 1230 newtypes = realloc(cs->types, (cs->ntypes + 1) * 1231 sizeof (*cs->types)); 1232 if (newtypes == NULL) { 1233 SETERROR(REG_ESPACE); 1234 return; 1235 } 1236 cs->types = newtypes; 1237 cs->types[cs->ntypes++] = wct; 1238 } 1239 1240 /* 1241 * dupl - emit a duplicate of a bunch of sops 1242 */ 1243 static sopno /* start of duplicate */ 1244 dupl(struct parse *p, 1245 sopno start, /* from here */ 1246 sopno finish) /* to this less one */ 1247 { 1248 sopno ret = HERE(); 1249 sopno len = finish - start; 1250 1251 assert(finish >= start); 1252 if (len == 0) 1253 return (ret); 1254 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1255 return (ret); 1256 assert(p->ssize >= p->slen + len); 1257 (void) memcpy((char *)(p->strip + p->slen), 1258 (char *)(p->strip + start), (size_t)len*sizeof (sop)); 1259 p->slen += len; 1260 return (ret); 1261 } 1262 1263 /* 1264 * doemit - emit a strip operator 1265 * 1266 * It might seem better to implement this as a macro with a function as 1267 * hard-case backup, but it's just too big and messy unless there are 1268 * some changes to the data structures. Maybe later. 1269 */ 1270 static void 1271 doemit(struct parse *p, sop op, size_t opnd) 1272 { 1273 /* avoid making error situations worse */ 1274 if (p->error != 0) 1275 return; 1276 1277 /* deal with oversize operands ("can't happen", more or less) */ 1278 assert(opnd < 1<<OPSHIFT); 1279 1280 /* deal with undersized strip */ 1281 if (p->slen >= p->ssize) 1282 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1283 return; 1284 1285 /* finally, it's all reduced to the easy case */ 1286 p->strip[p->slen++] = SOP(op, opnd); 1287 } 1288 1289 /* 1290 * doinsert - insert a sop into the strip 1291 */ 1292 static void 1293 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1294 { 1295 sopno sn; 1296 sop s; 1297 int i; 1298 1299 /* avoid making error situations worse */ 1300 if (p->error != 0) 1301 return; 1302 1303 sn = HERE(); 1304 EMIT(op, opnd); /* do checks, ensure space */ 1305 assert(HERE() == sn+1); 1306 s = p->strip[sn]; 1307 1308 /* adjust paren pointers */ 1309 assert(pos > 0); 1310 for (i = 1; i < NPAREN; i++) { 1311 if (p->pbegin[i] >= pos) { 1312 p->pbegin[i]++; 1313 } 1314 if (p->pend[i] >= pos) { 1315 p->pend[i]++; 1316 } 1317 } 1318 1319 (void) memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1320 (HERE()-pos-1)*sizeof (sop)); 1321 p->strip[pos] = s; 1322 } 1323 1324 /* 1325 * dofwd - complete a forward reference 1326 */ 1327 static void 1328 dofwd(struct parse *p, sopno pos, sop value) 1329 { 1330 /* avoid making error situations worse */ 1331 if (p->error != 0) 1332 return; 1333 1334 assert(value < 1<<OPSHIFT); 1335 p->strip[pos] = OP(p->strip[pos]) | value; 1336 } 1337 1338 /* 1339 * enlarge - enlarge the strip 1340 */ 1341 static int 1342 enlarge(struct parse *p, sopno size) 1343 { 1344 sop *sp; 1345 1346 if (p->ssize >= size) 1347 return (1); 1348 1349 sp = (sop *)realloc(p->strip, size*sizeof (sop)); 1350 if (sp == NULL) { 1351 SETERROR(REG_ESPACE); 1352 return (0); 1353 } 1354 p->strip = sp; 1355 p->ssize = size; 1356 return (1); 1357 } 1358 1359 /* 1360 * stripsnug - compact the strip 1361 */ 1362 static void 1363 stripsnug(struct parse *p, struct re_guts *g) 1364 { 1365 g->nstates = p->slen; 1366 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof (sop)); 1367 if (g->strip == NULL) { 1368 SETERROR(REG_ESPACE); 1369 g->strip = p->strip; 1370 } 1371 } 1372 1373 /* 1374 * findmust - fill in must and mlen with longest mandatory literal string 1375 * 1376 * This algorithm could do fancy things like analyzing the operands of | 1377 * for common subsequences. Someday. This code is simple and finds most 1378 * of the interesting cases. 1379 * 1380 * Note that must and mlen got initialized during setup. 1381 */ 1382 static void 1383 findmust(struct parse *p, struct re_guts *g) 1384 { 1385 sop *scan; 1386 sop *start = NULL; 1387 sop *newstart = NULL; 1388 sopno newlen; 1389 sop s; 1390 char *cp; 1391 int offset; 1392 char buf[MB_LEN_MAX]; 1393 size_t clen; 1394 mbstate_t mbs; 1395 locale_t loc = uselocale(NULL); 1396 1397 /* avoid making error situations worse */ 1398 if (p->error != 0) 1399 return; 1400 1401 /* 1402 * It's not generally safe to do a ``char'' substring search on 1403 * multibyte character strings, but it's safe for at least 1404 * UTF-8 (see RFC 3629). 1405 */ 1406 if (MB_CUR_MAX > 1 && 1407 strcmp(loc->runelocale->__encoding, "UTF-8") != 0) 1408 return; 1409 1410 /* find the longest OCHAR sequence in strip */ 1411 newlen = 0; 1412 offset = 0; 1413 g->moffset = 0; 1414 scan = g->strip + 1; 1415 do { 1416 s = *scan++; 1417 switch (OP(s)) { 1418 case OCHAR: /* sequence member */ 1419 if (newlen == 0) { /* new sequence */ 1420 (void) memset(&mbs, 0, sizeof (mbs)); 1421 newstart = scan - 1; 1422 } 1423 clen = wcrtomb(buf, OPND(s), &mbs); 1424 if (clen == (size_t)-1) 1425 goto toohard; 1426 newlen += clen; 1427 break; 1428 case OPLUS_: /* things that don't break one */ 1429 case OLPAREN: 1430 case ORPAREN: 1431 break; 1432 case OQUEST_: /* things that must be skipped */ 1433 case OCH_: 1434 offset = altoffset(scan, offset); 1435 scan--; 1436 do { 1437 scan += OPND(s); 1438 s = *scan; 1439 /* assert() interferes w debug printouts */ 1440 if (OP(s) != O_QUEST && OP(s) != O_CH && 1441 OP(s) != OOR2) { 1442 g->iflags |= BAD; 1443 return; 1444 } 1445 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1446 /* FALLTHROUGH */ 1447 case OBOW: /* things that break a sequence */ 1448 case OEOW: 1449 case OBOL: 1450 case OEOL: 1451 case O_QUEST: 1452 case O_CH: 1453 case OEND: 1454 if (newlen > g->mlen) { /* ends one */ 1455 start = newstart; 1456 g->mlen = newlen; 1457 if (offset > -1) { 1458 g->moffset += offset; 1459 offset = newlen; 1460 } else 1461 g->moffset = offset; 1462 } else { 1463 if (offset > -1) 1464 offset += newlen; 1465 } 1466 newlen = 0; 1467 break; 1468 case OANY: 1469 if (newlen > g->mlen) { /* ends one */ 1470 start = newstart; 1471 g->mlen = newlen; 1472 if (offset > -1) { 1473 g->moffset += offset; 1474 offset = newlen; 1475 } else 1476 g->moffset = offset; 1477 } else { 1478 if (offset > -1) 1479 offset += newlen; 1480 } 1481 if (offset > -1) 1482 offset++; 1483 newlen = 0; 1484 break; 1485 case OANYOF: /* may or may not invalidate offset */ 1486 /* First, everything as OANY */ 1487 if (newlen > g->mlen) { /* ends one */ 1488 start = newstart; 1489 g->mlen = newlen; 1490 if (offset > -1) { 1491 g->moffset += offset; 1492 offset = newlen; 1493 } else 1494 g->moffset = offset; 1495 } else { 1496 if (offset > -1) 1497 offset += newlen; 1498 } 1499 if (offset > -1) 1500 offset++; 1501 newlen = 0; 1502 break; 1503 toohard: 1504 default: 1505 /* 1506 * Anything here makes it impossible or too hard 1507 * to calculate the offset -- so we give up; 1508 * save the last known good offset, in case the 1509 * must sequence doesn't occur later. 1510 */ 1511 if (newlen > g->mlen) { /* ends one */ 1512 start = newstart; 1513 g->mlen = newlen; 1514 if (offset > -1) 1515 g->moffset += offset; 1516 else 1517 g->moffset = offset; 1518 } 1519 offset = -1; 1520 newlen = 0; 1521 break; 1522 } 1523 } while (OP(s) != OEND); 1524 1525 if (g->mlen == 0) { /* there isn't one */ 1526 g->moffset = -1; 1527 return; 1528 } 1529 1530 /* turn it into a character string */ 1531 g->must = malloc((size_t)g->mlen + 1); 1532 if (g->must == NULL) { /* argh; just forget it */ 1533 g->mlen = 0; 1534 g->moffset = -1; 1535 return; 1536 } 1537 cp = g->must; 1538 scan = start; 1539 (void) memset(&mbs, 0, sizeof (mbs)); 1540 while (cp < g->must + g->mlen) { 1541 while (OP(s = *scan++) != OCHAR) 1542 continue; 1543 clen = wcrtomb(cp, OPND(s), &mbs); 1544 assert(clen != (size_t)-1); 1545 cp += clen; 1546 } 1547 assert(cp == g->must + g->mlen); 1548 *cp++ = '\0'; /* just on general principles */ 1549 } 1550 1551 /* 1552 * altoffset - choose biggest offset among multiple choices 1553 * 1554 * Compute, recursively if necessary, the largest offset among multiple 1555 * re paths. 1556 */ 1557 static int 1558 altoffset(sop *scan, int offset) 1559 { 1560 int largest; 1561 int try; 1562 sop s; 1563 1564 /* If we gave up already on offsets, return */ 1565 if (offset == -1) 1566 return (-1); 1567 1568 largest = 0; 1569 try = 0; 1570 s = *scan++; 1571 while (OP(s) != O_QUEST && OP(s) != O_CH) { 1572 switch (OP(s)) { 1573 case OOR1: 1574 if (try > largest) 1575 largest = try; 1576 try = 0; 1577 break; 1578 case OQUEST_: 1579 case OCH_: 1580 try = altoffset(scan, try); 1581 if (try == -1) 1582 return (-1); 1583 scan--; 1584 do { 1585 scan += OPND(s); 1586 s = *scan; 1587 if (OP(s) != O_QUEST && OP(s) != O_CH && 1588 OP(s) != OOR2) 1589 return (-1); 1590 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1591 /* 1592 * We must skip to the next position, or we'll 1593 * leave altoffset() too early. 1594 */ 1595 scan++; 1596 break; 1597 case OANYOF: 1598 case OCHAR: 1599 case OANY: 1600 try++; 1601 /*FALLTHRU*/ 1602 case OBOW: 1603 case OEOW: 1604 case OLPAREN: 1605 case ORPAREN: 1606 case OOR2: 1607 break; 1608 default: 1609 try = -1; 1610 break; 1611 } 1612 if (try == -1) 1613 return (-1); 1614 s = *scan++; 1615 } 1616 1617 if (try > largest) 1618 largest = try; 1619 1620 return (largest+offset); 1621 } 1622 1623 /* 1624 * computejumps - compute char jumps for BM scan 1625 * 1626 * This algorithm assumes g->must exists and is has size greater than 1627 * zero. It's based on the algorithm found on Computer Algorithms by 1628 * Sara Baase. 1629 * 1630 * A char jump is the number of characters one needs to jump based on 1631 * the value of the character from the text that was mismatched. 1632 */ 1633 static void 1634 computejumps(struct parse *p, struct re_guts *g) 1635 { 1636 int ch; 1637 int mindex; 1638 1639 /* Avoid making errors worse */ 1640 if (p->error != 0) 1641 return; 1642 1643 g->charjump = (int *)malloc((NC + 1) * sizeof (int)); 1644 if (g->charjump == NULL) /* Not a fatal error */ 1645 return; 1646 /* Adjust for signed chars, if necessary */ 1647 g->charjump = &g->charjump[-(CHAR_MIN)]; 1648 1649 /* 1650 * If the character does not exist in the pattern, the jump 1651 * is equal to the number of characters in the pattern. 1652 */ 1653 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 1654 g->charjump[ch] = g->mlen; 1655 1656 /* 1657 * If the character does exist, compute the jump that would 1658 * take us to the last character in the pattern equal to it 1659 * (notice that we match right to left, so that last character 1660 * is the first one that would be matched). 1661 */ 1662 for (mindex = 0; mindex < g->mlen; mindex++) 1663 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 1664 } 1665 1666 /* 1667 * computematchjumps - compute match jumps for BM scan 1668 * 1669 * This algorithm assumes g->must exists and is has size greater than 1670 * zero. It's based on the algorithm found on Computer Algorithms by 1671 * Sara Baase. 1672 * 1673 * A match jump is the number of characters one needs to advance based 1674 * on the already-matched suffix. 1675 * Notice that all values here are minus (g->mlen-1), because of the way 1676 * the search algorithm works. 1677 */ 1678 static void 1679 computematchjumps(struct parse *p, struct re_guts *g) 1680 { 1681 int mindex; /* General "must" iterator */ 1682 int suffix; /* Keeps track of matching suffix */ 1683 int ssuffix; /* Keeps track of suffixes' suffix */ 1684 int *pmatches; 1685 /* 1686 * pmatches[k] points to the next i 1687 * such that i+1...mlen is a substring 1688 * of k+1...k+mlen-i-1 1689 */ 1690 1691 /* Avoid making errors worse */ 1692 if (p->error != 0) 1693 return; 1694 1695 pmatches = (int *)malloc(g->mlen * sizeof (unsigned int)); 1696 if (pmatches == NULL) { 1697 g->matchjump = NULL; 1698 return; 1699 } 1700 1701 g->matchjump = (int *)malloc(g->mlen * sizeof (unsigned int)); 1702 if (g->matchjump == NULL) { /* Not a fatal error */ 1703 free(pmatches); 1704 return; 1705 } 1706 1707 /* Set maximum possible jump for each character in the pattern */ 1708 for (mindex = 0; mindex < g->mlen; mindex++) 1709 g->matchjump[mindex] = 2*g->mlen - mindex - 1; 1710 1711 /* Compute pmatches[] */ 1712 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 1713 mindex--, suffix--) { 1714 pmatches[mindex] = suffix; 1715 1716 /* 1717 * If a mismatch is found, interrupting the substring, 1718 * compute the matchjump for that position. If no 1719 * mismatch is found, then a text substring mismatched 1720 * against the suffix will also mismatch against the 1721 * substring. 1722 */ 1723 while (suffix < g->mlen && g->must[mindex] != g->must[suffix]) { 1724 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1725 g->mlen - mindex - 1); 1726 suffix = pmatches[suffix]; 1727 } 1728 } 1729 1730 /* 1731 * Compute the matchjump up to the last substring found to jump 1732 * to the beginning of the largest must pattern prefix matching 1733 * it's own suffix. 1734 */ 1735 for (mindex = 0; mindex <= suffix; mindex++) 1736 g->matchjump[mindex] = MIN(g->matchjump[mindex], 1737 g->mlen + suffix - mindex); 1738 1739 ssuffix = pmatches[suffix]; 1740 while (suffix < g->mlen) { 1741 while (suffix <= ssuffix && suffix < g->mlen) { 1742 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1743 g->mlen + ssuffix - suffix); 1744 suffix++; 1745 } 1746 if (suffix < g->mlen) 1747 ssuffix = pmatches[ssuffix]; 1748 } 1749 1750 free(pmatches); 1751 } 1752 1753 /* 1754 * pluscount - count + nesting 1755 */ 1756 static sopno /* nesting depth */ 1757 pluscount(struct parse *p, struct re_guts *g) 1758 { 1759 sop *scan; 1760 sop s; 1761 sopno plusnest = 0; 1762 sopno maxnest = 0; 1763 1764 if (p->error != 0) 1765 return (0); /* there may not be an OEND */ 1766 1767 scan = g->strip + 1; 1768 do { 1769 s = *scan++; 1770 switch (OP(s)) { 1771 case OPLUS_: 1772 plusnest++; 1773 break; 1774 case O_PLUS: 1775 if (plusnest > maxnest) 1776 maxnest = plusnest; 1777 plusnest--; 1778 break; 1779 } 1780 } while (OP(s) != OEND); 1781 if (plusnest != 0) 1782 g->iflags |= BAD; 1783 return (maxnest); 1784 } 1785