1 /* 2 * Copyright 2013 Garrett D'Amore <garrett@damore.org> 3 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 4 * Copyright 2012 Milan Jurik. All rights reserved. 5 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 6 * Copyright (c) 1992, 1993, 1994 7 * The Regents of the University of California. All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * Henry Spencer. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 */ 36 37 #include "lint.h" 38 #include "file64.h" 39 #include <sys/types.h> 40 #include <stdio.h> 41 #include <string.h> 42 #include <ctype.h> 43 #include <limits.h> 44 #include <stdlib.h> 45 #include <regex.h> 46 #include <wchar.h> 47 #include <wctype.h> 48 49 #include "../locale/runetype.h" 50 #include "../locale/collate.h" 51 52 #include "utils.h" 53 #include "regex2.h" 54 55 #include "cname.h" 56 #include "../locale/mblocal.h" 57 58 /* 59 * parse structure, passed up and down to avoid global variables and 60 * other clumsinesses 61 */ 62 struct parse { 63 const char *next; /* next character in RE */ 64 const char *end; /* end of string (-> NUL normally) */ 65 int error; /* has an error been seen? */ 66 sop *strip; /* malloced strip */ 67 sopno ssize; /* malloced strip size (allocated) */ 68 sopno slen; /* malloced strip length (used) */ 69 int ncsalloc; /* number of csets allocated */ 70 struct re_guts *g; 71 #define NPAREN 10 /* we need to remember () 1-9 for back refs */ 72 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 73 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 74 }; 75 76 /* ========= begin header generated by ./mkh ========= */ 77 #ifdef __cplusplus 78 extern "C" { 79 #endif 80 81 /* === regcomp.c === */ 82 static void p_ere(struct parse *p, int stop); 83 static void p_ere_exp(struct parse *p); 84 static void p_str(struct parse *p); 85 static void p_bre(struct parse *p, int end1, int end2); 86 static int p_simp_re(struct parse *p, int starordinary); 87 static int p_count(struct parse *p); 88 static void p_bracket(struct parse *p); 89 static void p_b_term(struct parse *p, cset *cs); 90 static void p_b_cclass(struct parse *p, cset *cs); 91 static void p_b_eclass(struct parse *p, cset *cs); 92 static wint_t p_b_symbol(struct parse *p); 93 static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 94 static wint_t othercase(wint_t ch); 95 static void bothcases(struct parse *p, wint_t ch); 96 static void ordinary(struct parse *p, wint_t ch); 97 static void nonnewline(struct parse *p); 98 static void repeat(struct parse *p, sopno start, int from, int to); 99 static int seterr(struct parse *p, int e); 100 static cset *allocset(struct parse *p); 101 static void freeset(struct parse *p, cset *cs); 102 static void CHadd(struct parse *p, cset *cs, wint_t ch); 103 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 104 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 105 static wint_t singleton(cset *cs); 106 static sopno dupl(struct parse *p, sopno start, sopno finish); 107 static void doemit(struct parse *p, sop op, size_t opnd); 108 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 109 static void dofwd(struct parse *p, sopno pos, sop value); 110 static int enlarge(struct parse *p, sopno size); 111 static void stripsnug(struct parse *p, struct re_guts *g); 112 static void findmust(struct parse *p, struct re_guts *g); 113 static int altoffset(sop *scan, int offset); 114 static void computejumps(struct parse *p, struct re_guts *g); 115 static void computematchjumps(struct parse *p, struct re_guts *g); 116 static sopno pluscount(struct parse *p, struct re_guts *g); 117 static wint_t wgetnext(struct parse *p); 118 119 #ifdef __cplusplus 120 } 121 #endif 122 /* ========= end header generated by ./mkh ========= */ 123 124 static char nuls[10]; /* place to point scanner in event of error */ 125 126 /* 127 * macros for use with parse structure 128 * BEWARE: these know that the parse structure is named `p' !!! 129 */ 130 #define PEEK() (*p->next) 131 #define PEEK2() (*(p->next+1)) 132 #define MORE() (p->next < p->end) 133 #define MORE2() (p->next+1 < p->end) 134 #define SEE(c) (MORE() && PEEK() == (c)) 135 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 136 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 137 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 138 #define NEXT() (p->next++) 139 #define NEXT2() (p->next += 2) 140 #define NEXTn(n) (p->next += (n)) 141 #define GETNEXT() (*p->next++) 142 #define WGETNEXT() wgetnext(p) 143 #define SETERROR(e) ((void)seterr(p, (e))) 144 #define REQUIRE(co, e) ((co) || seterr(p, e)) 145 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 146 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 147 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 148 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 149 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 150 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 151 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 152 #define HERE() (p->slen) 153 #define THERE() (p->slen - 1) 154 #define THERETHERE() (p->slen - 2) 155 #define DROP(n) (p->slen -= (n)) 156 157 #ifndef NDEBUG 158 static int never = 0; /* for use in asserts; shuts lint up */ 159 #else 160 #define never 0 /* some <assert.h>s have bugs too */ 161 #endif 162 163 /* 164 * regcomp - interface for parser and compilation 165 */ 166 int /* 0 success, otherwise REG_something */ 167 regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern, 168 int cflags) 169 { 170 struct parse pa; 171 struct re_guts *g; 172 struct parse *p = &pa; 173 int i; 174 size_t len; 175 size_t maxlen; 176 #ifdef REDEBUG 177 #define GOODFLAGS(f) (f) 178 #else 179 #define GOODFLAGS(f) ((f)&~REG_DUMP) 180 #endif 181 182 /* We had REG_INVARG, but we don't have that on Solaris. */ 183 cflags = GOODFLAGS(cflags); 184 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 185 return (REG_EFATAL); 186 187 if (cflags®_PEND) { 188 if (preg->re_endp < pattern) 189 return (REG_EFATAL); 190 len = preg->re_endp - pattern; 191 } else 192 len = strlen(pattern); 193 194 /* do the mallocs early so failure handling is easy */ 195 g = (struct re_guts *)malloc(sizeof (struct re_guts)); 196 if (g == NULL) 197 return (REG_ESPACE); 198 /* 199 * Limit the pattern space to avoid a 32-bit overflow on buffer 200 * extension. Also avoid any signed overflow in case of conversion 201 * so make the real limit based on a 31-bit overflow. 202 * 203 * Likely not applicable on 64-bit systems but handle the case 204 * generically (who are we to stop people from using ~715MB+ 205 * patterns?). 206 */ 207 maxlen = ((size_t)-1 >> 1) / sizeof (sop) * 2 / 3; 208 if (len >= maxlen) { 209 free((char *)g); 210 return (REG_ESPACE); 211 } 212 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 213 assert(p->ssize >= len); 214 215 p->strip = (sop *)malloc(p->ssize * sizeof (sop)); 216 p->slen = 0; 217 if (p->strip == NULL) { 218 free((char *)g); 219 return (REG_ESPACE); 220 } 221 222 /* set things up */ 223 p->g = g; 224 p->next = pattern; /* convenience; we do not modify it */ 225 p->end = p->next + len; 226 p->error = 0; 227 p->ncsalloc = 0; 228 for (i = 0; i < NPAREN; i++) { 229 p->pbegin[i] = 0; 230 p->pend[i] = 0; 231 } 232 g->sets = NULL; 233 g->ncsets = 0; 234 g->cflags = cflags; 235 g->iflags = 0; 236 g->nbol = 0; 237 g->neol = 0; 238 g->must = NULL; 239 g->moffset = -1; 240 g->charjump = NULL; 241 g->matchjump = NULL; 242 g->mlen = 0; 243 g->nsub = 0; 244 g->backrefs = 0; 245 246 /* do it */ 247 EMIT(OEND, 0); 248 g->firststate = THERE(); 249 if (cflags®_EXTENDED) 250 p_ere(p, OUT); 251 else if (cflags®_NOSPEC) 252 p_str(p); 253 else 254 p_bre(p, OUT, OUT); 255 EMIT(OEND, 0); 256 g->laststate = THERE(); 257 258 /* tidy up loose ends and fill things in */ 259 stripsnug(p, g); 260 findmust(p, g); 261 /* 262 * only use Boyer-Moore algorithm if the pattern is bigger 263 * than three characters 264 */ 265 if (g->mlen > 3) { 266 computejumps(p, g); 267 computematchjumps(p, g); 268 if (g->matchjump == NULL && g->charjump != NULL) { 269 free(g->charjump); 270 g->charjump = NULL; 271 } 272 } 273 g->nplus = pluscount(p, g); 274 g->magic = MAGIC2; 275 preg->re_nsub = g->nsub; 276 preg->re_g = g; 277 preg->re_magic = MAGIC1; 278 #ifndef REDEBUG 279 /* not debugging, so can't rely on the assert() in regexec() */ 280 if (g->iflags&BAD) 281 SETERROR(REG_EFATAL); 282 #endif 283 284 /* win or lose, we're done */ 285 if (p->error != 0) /* lose */ 286 regfree(preg); 287 return (p->error); 288 } 289 290 /* 291 * p_ere - ERE parser top level, concatenation and alternation 292 */ 293 static void 294 p_ere(struct parse *p, 295 int stop) /* character this ERE should end at */ 296 { 297 char c; 298 sopno prevback; 299 sopno prevfwd; 300 sopno conc; 301 int first = 1; /* is this the first alternative? */ 302 303 for (;;) { 304 /* do a bunch of concatenated expressions */ 305 conc = HERE(); 306 while (MORE() && (c = PEEK()) != '|' && c != stop) 307 p_ere_exp(p); 308 /* require nonempty */ 309 (void) REQUIRE(HERE() != conc, REG_BADPAT); 310 311 if (!EAT('|')) 312 break; /* NOTE BREAK OUT */ 313 314 if (first) { 315 INSERT(OCH_, conc); /* offset is wrong */ 316 prevfwd = conc; 317 prevback = conc; 318 first = 0; 319 } 320 ASTERN(OOR1, prevback); 321 prevback = THERE(); 322 AHEAD(prevfwd); /* fix previous offset */ 323 prevfwd = HERE(); 324 EMIT(OOR2, 0); /* offset is very wrong */ 325 } 326 327 if (!first) { /* tail-end fixups */ 328 AHEAD(prevfwd); 329 ASTERN(O_CH, prevback); 330 } 331 332 assert(!MORE() || SEE(stop)); 333 } 334 335 /* 336 * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 337 */ 338 static void 339 p_ere_exp(struct parse *p) 340 { 341 char c; 342 wint_t wc; 343 sopno pos; 344 int count; 345 int count2; 346 sopno subno; 347 int wascaret = 0; 348 349 assert(MORE()); /* caller should have ensured this */ 350 c = GETNEXT(); 351 352 pos = HERE(); 353 switch (c) { 354 case '(': 355 (void) REQUIRE(MORE(), REG_EPAREN); 356 p->g->nsub++; 357 subno = p->g->nsub; 358 if (subno < NPAREN) 359 p->pbegin[subno] = HERE(); 360 EMIT(OLPAREN, subno); 361 if (!SEE(')')) 362 p_ere(p, ')'); 363 if (subno < NPAREN) { 364 p->pend[subno] = HERE(); 365 assert(p->pend[subno] != 0); 366 } 367 EMIT(ORPAREN, subno); 368 (void) MUSTEAT(')', REG_EPAREN); 369 break; 370 #ifndef POSIX_MISTAKE 371 case ')': /* happens only if no current unmatched ( */ 372 /* 373 * You may ask, why the ifndef? Because I didn't notice 374 * this until slightly too late for 1003.2, and none of the 375 * other 1003.2 regular-expression reviewers noticed it at 376 * all. So an unmatched ) is legal POSIX, at least until 377 * we can get it fixed. 378 */ 379 SETERROR(REG_EPAREN); 380 break; 381 #endif 382 case '^': 383 EMIT(OBOL, 0); 384 p->g->iflags |= USEBOL; 385 p->g->nbol++; 386 wascaret = 1; 387 break; 388 case '$': 389 EMIT(OEOL, 0); 390 p->g->iflags |= USEEOL; 391 p->g->neol++; 392 break; 393 case '|': 394 SETERROR(REG_BADPAT); 395 break; 396 case '*': 397 case '+': 398 case '?': 399 case '{': 400 SETERROR(REG_BADRPT); 401 break; 402 case '.': 403 if (p->g->cflags®_NEWLINE) 404 nonnewline(p); 405 else 406 EMIT(OANY, 0); 407 break; 408 case '[': 409 p_bracket(p); 410 break; 411 case '\\': 412 (void) REQUIRE(MORE(), REG_EESCAPE); 413 wc = WGETNEXT(); 414 switch (wc) { 415 case '<': 416 EMIT(OBOW, 0); 417 break; 418 case '>': 419 EMIT(OEOW, 0); 420 break; 421 default: 422 ordinary(p, wc); 423 break; 424 } 425 break; 426 default: 427 if (p->error != 0) 428 return; 429 p->next--; 430 wc = WGETNEXT(); 431 ordinary(p, wc); 432 break; 433 } 434 435 if (!MORE()) 436 return; 437 c = PEEK(); 438 /* we call { a repetition if followed by a digit */ 439 if (!(c == '*' || c == '+' || c == '?' || c == '{')) 440 return; /* no repetition, we're done */ 441 else if (c == '{') 442 (void) REQUIRE(MORE2() && \ 443 (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT); 444 NEXT(); 445 446 (void) REQUIRE(!wascaret, REG_BADRPT); 447 switch (c) { 448 case '*': /* implemented as +? */ 449 /* this case does not require the (y|) trick, noKLUDGE */ 450 INSERT(OPLUS_, pos); 451 ASTERN(O_PLUS, pos); 452 INSERT(OQUEST_, pos); 453 ASTERN(O_QUEST, pos); 454 break; 455 case '+': 456 INSERT(OPLUS_, pos); 457 ASTERN(O_PLUS, pos); 458 break; 459 case '?': 460 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 461 INSERT(OCH_, pos); /* offset slightly wrong */ 462 ASTERN(OOR1, pos); /* this one's right */ 463 AHEAD(pos); /* fix the OCH_ */ 464 EMIT(OOR2, 0); /* offset very wrong... */ 465 AHEAD(THERE()); /* ...so fix it */ 466 ASTERN(O_CH, THERETHERE()); 467 break; 468 case '{': 469 count = p_count(p); 470 if (EAT(',')) { 471 if (isdigit((uch)PEEK())) { 472 count2 = p_count(p); 473 (void) REQUIRE(count <= count2, REG_BADBR); 474 } else /* single number with comma */ 475 count2 = INFINITY; 476 } else /* just a single number */ 477 count2 = count; 478 repeat(p, pos, count, count2); 479 if (!EAT('}')) { /* error heuristics */ 480 while (MORE() && PEEK() != '}') 481 NEXT(); 482 (void) REQUIRE(MORE(), REG_EBRACE); 483 SETERROR(REG_BADBR); 484 } 485 break; 486 } 487 488 if (!MORE()) 489 return; 490 c = PEEK(); 491 if (!(c == '*' || c == '+' || c == '?' || 492 (c == '{' && MORE2() && isdigit((uch)PEEK2())))) 493 return; 494 SETERROR(REG_BADRPT); 495 } 496 497 /* 498 * p_str - string (no metacharacters) "parser" 499 */ 500 static void 501 p_str(struct parse *p) 502 { 503 (void) REQUIRE(MORE(), REG_BADPAT); 504 while (MORE()) 505 ordinary(p, WGETNEXT()); 506 } 507 508 /* 509 * p_bre - BRE parser top level, anchoring and concatenation 510 * Giving end1 as OUT essentially eliminates the end1/end2 check. 511 * 512 * This implementation is a bit of a kludge, in that a trailing $ is first 513 * taken as an ordinary character and then revised to be an anchor. 514 * The amount of lookahead needed to avoid this kludge is excessive. 515 */ 516 static void 517 p_bre(struct parse *p, 518 int end1, /* first terminating character */ 519 int end2) /* second terminating character */ 520 { 521 sopno start = HERE(); 522 int first = 1; /* first subexpression? */ 523 int wasdollar = 0; 524 525 if (EAT('^')) { 526 EMIT(OBOL, 0); 527 p->g->iflags |= USEBOL; 528 p->g->nbol++; 529 } 530 while (MORE() && !SEETWO(end1, end2)) { 531 wasdollar = p_simp_re(p, first); 532 first = 0; 533 } 534 if (wasdollar) { /* oops, that was a trailing anchor */ 535 DROP(1); 536 EMIT(OEOL, 0); 537 p->g->iflags |= USEEOL; 538 p->g->neol++; 539 } 540 541 (void) REQUIRE(HERE() != start, REG_BADPAT); /* require nonempty */ 542 } 543 544 /* 545 * p_simp_re - parse a simple RE, an atom possibly followed by a repetition 546 */ 547 static int /* was the simple RE an unbackslashed $? */ 548 p_simp_re(struct parse *p, 549 int starordinary) /* is a leading * an ordinary character? */ 550 { 551 int c; 552 int count; 553 int count2; 554 sopno pos; 555 int i; 556 wint_t wc; 557 sopno subno; 558 #define BACKSL (1<<CHAR_BIT) 559 560 pos = HERE(); /* repetition op, if any, covers from here */ 561 562 assert(MORE()); /* caller should have ensured this */ 563 c = GETNEXT(); 564 if (c == '\\') { 565 (void) REQUIRE(MORE(), REG_EESCAPE); 566 c = BACKSL | GETNEXT(); 567 } 568 switch (c) { 569 case '.': 570 if (p->g->cflags®_NEWLINE) 571 nonnewline(p); 572 else 573 EMIT(OANY, 0); 574 break; 575 case '[': 576 p_bracket(p); 577 break; 578 case BACKSL|'<': 579 EMIT(OBOW, 0); 580 break; 581 case BACKSL|'>': 582 EMIT(OEOW, 0); 583 break; 584 case BACKSL|'{': 585 SETERROR(REG_BADRPT); 586 break; 587 case BACKSL|'(': 588 p->g->nsub++; 589 subno = p->g->nsub; 590 if (subno < NPAREN) 591 p->pbegin[subno] = HERE(); 592 EMIT(OLPAREN, subno); 593 /* the MORE here is an error heuristic */ 594 if (MORE() && !SEETWO('\\', ')')) 595 p_bre(p, '\\', ')'); 596 if (subno < NPAREN) { 597 p->pend[subno] = HERE(); 598 assert(p->pend[subno] != 0); 599 } 600 EMIT(ORPAREN, subno); 601 (void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 602 break; 603 case BACKSL|')': /* should not get here -- must be user */ 604 SETERROR(REG_EPAREN); 605 break; 606 case BACKSL|'1': 607 case BACKSL|'2': 608 case BACKSL|'3': 609 case BACKSL|'4': 610 case BACKSL|'5': 611 case BACKSL|'6': 612 case BACKSL|'7': 613 case BACKSL|'8': 614 case BACKSL|'9': 615 i = (c&~BACKSL) - '0'; 616 assert(i < NPAREN); 617 if (p->pend[i] != 0) { 618 assert(i <= p->g->nsub); 619 EMIT(OBACK_, i); 620 assert(p->pbegin[i] != 0); 621 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 622 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 623 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 624 EMIT(O_BACK, i); 625 } else 626 SETERROR(REG_ESUBREG); 627 p->g->backrefs = 1; 628 break; 629 case '*': 630 (void) REQUIRE(starordinary, REG_BADRPT); 631 /* FALLTHROUGH */ 632 default: 633 if (p->error != 0) 634 return (0); /* Definitely not $... */ 635 p->next--; 636 wc = WGETNEXT(); 637 ordinary(p, wc); 638 break; 639 } 640 641 if (EAT('*')) { /* implemented as +? */ 642 /* this case does not require the (y|) trick, noKLUDGE */ 643 INSERT(OPLUS_, pos); 644 ASTERN(O_PLUS, pos); 645 INSERT(OQUEST_, pos); 646 ASTERN(O_QUEST, pos); 647 } else if (EATTWO('\\', '{')) { 648 count = p_count(p); 649 if (EAT(',')) { 650 if (MORE() && isdigit((uch)PEEK())) { 651 count2 = p_count(p); 652 (void) REQUIRE(count <= count2, REG_BADBR); 653 } else /* single number with comma */ 654 count2 = INFINITY; 655 } else /* just a single number */ 656 count2 = count; 657 repeat(p, pos, count, count2); 658 if (!EATTWO('\\', '}')) { /* error heuristics */ 659 while (MORE() && !SEETWO('\\', '}')) 660 NEXT(); 661 (void) REQUIRE(MORE(), REG_EBRACE); 662 SETERROR(REG_BADBR); 663 } 664 } else if (c == '$') /* $ (but not \$) ends it */ 665 return (1); 666 667 return (0); 668 } 669 670 /* 671 * p_count - parse a repetition count 672 */ 673 static int /* the value */ 674 p_count(struct parse *p) 675 { 676 int count = 0; 677 int ndigits = 0; 678 679 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 680 count = count*10 + (GETNEXT() - '0'); 681 ndigits++; 682 } 683 684 (void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 685 return (count); 686 } 687 688 /* 689 * p_bracket - parse a bracketed character list 690 */ 691 static void 692 p_bracket(struct parse *p) 693 { 694 cset *cs; 695 wint_t ch; 696 697 /* Dept of Truly Sickening Special-Case Kludges */ 698 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 699 EMIT(OBOW, 0); 700 NEXTn(6); 701 return; 702 } 703 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 704 EMIT(OEOW, 0); 705 NEXTn(6); 706 return; 707 } 708 709 if ((cs = allocset(p)) == NULL) 710 return; 711 712 if (p->g->cflags®_ICASE) 713 cs->icase = 1; 714 if (EAT('^')) 715 cs->invert = 1; 716 if (EAT(']')) 717 CHadd(p, cs, ']'); 718 else if (EAT('-')) 719 CHadd(p, cs, '-'); 720 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 721 p_b_term(p, cs); 722 if (EAT('-')) 723 CHadd(p, cs, '-'); 724 (void) MUSTEAT(']', REG_EBRACK); 725 726 if (p->error != 0) /* don't mess things up further */ 727 return; 728 729 if (cs->invert && p->g->cflags®_NEWLINE) 730 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 731 732 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 733 ordinary(p, ch); 734 freeset(p, cs); 735 } else 736 EMIT(OANYOF, (int)(cs - p->g->sets)); 737 } 738 739 /* 740 * p_b_term - parse one term of a bracketed character list 741 */ 742 static void 743 p_b_term(struct parse *p, cset *cs) 744 { 745 char c; 746 wint_t start, finish; 747 wint_t i; 748 locale_t loc = uselocale(NULL); 749 750 /* classify what we've got */ 751 switch ((MORE()) ? PEEK() : '\0') { 752 case '[': 753 c = (MORE2()) ? PEEK2() : '\0'; 754 break; 755 case '-': 756 SETERROR(REG_ERANGE); 757 return; /* NOTE RETURN */ 758 default: 759 c = '\0'; 760 break; 761 } 762 763 switch (c) { 764 case ':': /* character class */ 765 NEXT2(); 766 (void) REQUIRE(MORE(), REG_EBRACK); 767 c = PEEK(); 768 (void) REQUIRE(c != '-' && c != ']', REG_ECTYPE); 769 p_b_cclass(p, cs); 770 (void) REQUIRE(MORE(), REG_EBRACK); 771 (void) REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 772 break; 773 case '=': /* equivalence class */ 774 NEXT2(); 775 (void) REQUIRE(MORE(), REG_EBRACK); 776 c = PEEK(); 777 (void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 778 p_b_eclass(p, cs); 779 (void) REQUIRE(MORE(), REG_EBRACK); 780 (void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 781 break; 782 default: /* symbol, ordinary character, or range */ 783 start = p_b_symbol(p); 784 if (SEE('-') && MORE2() && PEEK2() != ']') { 785 /* range */ 786 NEXT(); 787 if (EAT('-')) 788 finish = '-'; 789 else 790 finish = p_b_symbol(p); 791 } else 792 finish = start; 793 if (start == finish) 794 CHadd(p, cs, start); 795 else { 796 if (loc->collate->lc_is_posix) { 797 (void) REQUIRE((uch)start <= (uch)finish, 798 REG_ERANGE); 799 CHaddrange(p, cs, start, finish); 800 } else { 801 (void) REQUIRE(_collate_range_cmp(start, 802 finish, loc) <= 0, REG_ERANGE); 803 for (i = 0; i <= UCHAR_MAX; i++) { 804 if (_collate_range_cmp(start, i, loc) 805 <= 0 && 806 _collate_range_cmp(i, finish, loc) 807 <= 0) 808 CHadd(p, cs, i); 809 } 810 } 811 } 812 break; 813 } 814 } 815 816 /* 817 * p_b_cclass - parse a character-class name and deal with it 818 */ 819 static void 820 p_b_cclass(struct parse *p, cset *cs) 821 { 822 const char *sp = p->next; 823 size_t len; 824 wctype_t wct; 825 char clname[16]; 826 827 while (MORE() && isalpha((uch)PEEK())) 828 NEXT(); 829 len = p->next - sp; 830 if (len >= sizeof (clname) - 1) { 831 SETERROR(REG_ECTYPE); 832 return; 833 } 834 (void) memcpy(clname, sp, len); 835 clname[len] = '\0'; 836 if ((wct = wctype(clname)) == 0) { 837 SETERROR(REG_ECTYPE); 838 return; 839 } 840 CHaddtype(p, cs, wct); 841 } 842 843 /* 844 * p_b_eclass - parse an equivalence-class name and deal with it 845 * 846 * This implementation is incomplete. xxx 847 */ 848 static void 849 p_b_eclass(struct parse *p, cset *cs) 850 { 851 wint_t c; 852 853 c = p_b_coll_elem(p, '='); 854 CHadd(p, cs, c); 855 } 856 857 /* 858 * p_b_symbol - parse a character or [..]ed multicharacter collating symbol 859 */ 860 static wint_t /* value of symbol */ 861 p_b_symbol(struct parse *p) 862 { 863 wint_t value; 864 865 (void) REQUIRE(MORE(), REG_EBRACK); 866 if (!EATTWO('[', '.')) 867 return (WGETNEXT()); 868 869 /* collating symbol */ 870 value = p_b_coll_elem(p, '.'); 871 (void) REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 872 return (value); 873 } 874 875 /* 876 * p_b_coll_elem - parse a collating-element name and look it up 877 */ 878 static wint_t /* value of collating element */ 879 p_b_coll_elem(struct parse *p, 880 wint_t endc) /* name ended by endc,']' */ 881 { 882 const char *sp = p->next; 883 struct cname *cp; 884 mbstate_t mbs; 885 wchar_t wc; 886 size_t clen, len; 887 888 while (MORE() && !SEETWO(endc, ']')) 889 NEXT(); 890 if (!MORE()) { 891 SETERROR(REG_EBRACK); 892 return (0); 893 } 894 len = p->next - sp; 895 for (cp = cnames; cp->name != NULL; cp++) 896 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 897 return (cp->code); /* known name */ 898 (void) memset(&mbs, 0, sizeof (mbs)); 899 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 900 return (wc); /* single character */ 901 else if (clen == (size_t)-1 || clen == (size_t)-2) 902 SETERROR(REG_ECHAR); 903 else 904 SETERROR(REG_ECOLLATE); /* neither */ 905 return (0); 906 } 907 908 /* 909 * othercase - return the case counterpart of an alphabetic 910 */ 911 static wint_t /* if no counterpart, return ch */ 912 othercase(wint_t ch) 913 { 914 assert(iswalpha(ch)); 915 if (iswupper(ch)) 916 return (towlower(ch)); 917 else if (iswlower(ch)) 918 return (towupper(ch)); 919 else /* peculiar, but could happen */ 920 return (ch); 921 } 922 923 /* 924 * bothcases - emit a dualcase version of a two-case character 925 * 926 * Boy, is this implementation ever a kludge... 927 */ 928 static void 929 bothcases(struct parse *p, wint_t ch) 930 { 931 const char *oldnext = p->next; 932 const char *oldend = p->end; 933 char bracket[3 + MB_LEN_MAX]; 934 size_t n; 935 mbstate_t mbs; 936 937 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 938 p->next = bracket; 939 (void) memset(&mbs, 0, sizeof (mbs)); 940 n = wcrtomb(bracket, ch, &mbs); 941 assert(n != (size_t)-1); 942 bracket[n] = ']'; 943 bracket[n + 1] = '\0'; 944 p->end = bracket+n+1; 945 p_bracket(p); 946 assert(p->next == p->end); 947 p->next = oldnext; 948 p->end = oldend; 949 } 950 951 /* 952 * ordinary - emit an ordinary character 953 */ 954 static void 955 ordinary(struct parse *p, wint_t ch) 956 { 957 cset *cs; 958 959 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 960 bothcases(p, ch); 961 else if ((ch & OPDMASK) == ch) 962 EMIT(OCHAR, ch); 963 else { 964 /* 965 * Kludge: character is too big to fit into an OCHAR operand. 966 * Emit a singleton set. 967 */ 968 if ((cs = allocset(p)) == NULL) 969 return; 970 CHadd(p, cs, ch); 971 EMIT(OANYOF, (int)(cs - p->g->sets)); 972 } 973 } 974 975 /* 976 * nonnewline - emit REG_NEWLINE version of OANY 977 * 978 * Boy, is this implementation ever a kludge... 979 */ 980 static void 981 nonnewline(struct parse *p) 982 { 983 const char *oldnext = p->next; 984 const char *oldend = p->end; 985 char bracket[4]; 986 987 p->next = bracket; 988 p->end = bracket+3; 989 bracket[0] = '^'; 990 bracket[1] = '\n'; 991 bracket[2] = ']'; 992 bracket[3] = '\0'; 993 p_bracket(p); 994 assert(p->next == bracket+3); 995 p->next = oldnext; 996 p->end = oldend; 997 } 998 999 /* 1000 * repeat - generate code for a bounded repetition, recursively if needed 1001 */ 1002 static void 1003 repeat(struct parse *p, 1004 sopno start, /* operand from here to end of strip */ 1005 int from, /* repeated from this number */ 1006 int to) /* to this number of times (maybe INFINITY) */ 1007 { 1008 sopno finish = HERE(); 1009 #define N 2 1010 #define INF 3 1011 #define REP(f, t) ((f)*8 + (t)) 1012 #define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1013 sopno copy; 1014 1015 if (p->error != 0) /* head off possible runaway recursion */ 1016 return; 1017 1018 assert(from <= to); 1019 1020 switch (REP(MAP(from), MAP(to))) { 1021 case REP(0, 0): /* must be user doing this */ 1022 DROP(finish-start); /* drop the operand */ 1023 break; 1024 case REP(0, 1): /* as x{1,1}? */ 1025 case REP(0, N): /* as x{1,n}? */ 1026 case REP(0, INF): /* as x{1,}? */ 1027 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1028 INSERT(OCH_, start); /* offset is wrong... */ 1029 repeat(p, start+1, 1, to); 1030 ASTERN(OOR1, start); 1031 AHEAD(start); /* ... fix it */ 1032 EMIT(OOR2, 0); 1033 AHEAD(THERE()); 1034 ASTERN(O_CH, THERETHERE()); 1035 break; 1036 case REP(1, 1): /* trivial case */ 1037 /* done */ 1038 break; 1039 case REP(1, N): /* as x?x{1,n-1} */ 1040 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1041 INSERT(OCH_, start); 1042 ASTERN(OOR1, start); 1043 AHEAD(start); 1044 EMIT(OOR2, 0); /* offset very wrong... */ 1045 AHEAD(THERE()); /* ...so fix it */ 1046 ASTERN(O_CH, THERETHERE()); 1047 copy = dupl(p, start+1, finish+1); 1048 assert(copy == finish+4); 1049 repeat(p, copy, 1, to-1); 1050 break; 1051 case REP(1, INF): /* as x+ */ 1052 INSERT(OPLUS_, start); 1053 ASTERN(O_PLUS, start); 1054 break; 1055 case REP(N, N): /* as xx{m-1,n-1} */ 1056 copy = dupl(p, start, finish); 1057 repeat(p, copy, from-1, to-1); 1058 break; 1059 case REP(N, INF): /* as xx{n-1,INF} */ 1060 copy = dupl(p, start, finish); 1061 repeat(p, copy, from-1, to); 1062 break; 1063 default: /* "can't happen" */ 1064 SETERROR(REG_EFATAL); /* just in case */ 1065 break; 1066 } 1067 } 1068 1069 /* 1070 * wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1071 * character from the parse struct, signals a REG_ILLSEQ error if the 1072 * character can't be converted. Returns the number of bytes consumed. 1073 */ 1074 static wint_t 1075 wgetnext(struct parse *p) 1076 { 1077 mbstate_t mbs; 1078 wchar_t wc; 1079 size_t n; 1080 1081 (void) memset(&mbs, 0, sizeof (mbs)); 1082 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); 1083 if (n == (size_t)-1 || n == (size_t)-2) { 1084 SETERROR(REG_ECHAR); 1085 return (0); 1086 } 1087 if (n == 0) 1088 n = 1; 1089 p->next += n; 1090 return (wc); 1091 } 1092 1093 /* 1094 * seterr - set an error condition 1095 */ 1096 static int /* useless but makes type checking happy */ 1097 seterr(struct parse *p, int e) 1098 { 1099 if (p->error == 0) /* keep earliest error condition */ 1100 p->error = e; 1101 p->next = nuls; /* try to bring things to a halt */ 1102 p->end = nuls; 1103 return (0); /* make the return value well-defined */ 1104 } 1105 1106 /* 1107 * allocset - allocate a set of characters for [] 1108 */ 1109 static cset * 1110 allocset(struct parse *p) 1111 { 1112 cset *cs, *ncs; 1113 1114 ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof (*ncs)); 1115 if (ncs == NULL) { 1116 SETERROR(REG_ESPACE); 1117 return (NULL); 1118 } 1119 p->g->sets = ncs; 1120 cs = &p->g->sets[p->g->ncsets++]; 1121 (void) memset(cs, 0, sizeof (*cs)); 1122 1123 return (cs); 1124 } 1125 1126 /* 1127 * freeset - free a now-unused set 1128 */ 1129 static void 1130 freeset(struct parse *p, cset *cs) 1131 { 1132 cset *top = &p->g->sets[p->g->ncsets]; 1133 1134 free(cs->wides); 1135 free(cs->ranges); 1136 free(cs->types); 1137 (void) memset(cs, 0, sizeof (*cs)); 1138 if (cs == top-1) /* recover only the easy case */ 1139 p->g->ncsets--; 1140 } 1141 1142 /* 1143 * singleton - Determine whether a set contains only one character, 1144 * returning it if so, otherwise returning OUT. 1145 */ 1146 static wint_t 1147 singleton(cset *cs) 1148 { 1149 wint_t i, s, n; 1150 1151 for (i = n = 0; i < NC; i++) 1152 if (CHIN(cs, i)) { 1153 n++; 1154 s = i; 1155 } 1156 if (n == 1) 1157 return (s); 1158 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1159 cs->icase == 0) 1160 return (cs->wides[0]); 1161 /* Don't bother handling the other cases. */ 1162 return (OUT); 1163 } 1164 1165 /* 1166 * CHadd - add character to character set. 1167 */ 1168 static void 1169 CHadd(struct parse *p, cset *cs, wint_t ch) 1170 { 1171 wint_t nch, *newwides; 1172 assert(ch >= 0); 1173 if (ch < NC) 1174 cs->bmp[ch >> 3] |= 1 << (ch & 7); 1175 else { 1176 newwides = realloc(cs->wides, (cs->nwides + 1) * 1177 sizeof (*cs->wides)); 1178 if (newwides == NULL) { 1179 SETERROR(REG_ESPACE); 1180 return; 1181 } 1182 cs->wides = newwides; 1183 cs->wides[cs->nwides++] = ch; 1184 } 1185 if (cs->icase) { 1186 if ((nch = towlower(ch)) < NC) 1187 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1188 if ((nch = towupper(ch)) < NC) 1189 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1190 } 1191 } 1192 1193 /* 1194 * CHaddrange - add all characters in the range [min,max] to a character set. 1195 */ 1196 static void 1197 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 1198 { 1199 crange *newranges; 1200 1201 for (; min < NC && min <= max; min++) 1202 CHadd(p, cs, min); 1203 if (min >= max) 1204 return; 1205 newranges = realloc(cs->ranges, (cs->nranges + 1) * 1206 sizeof (*cs->ranges)); 1207 if (newranges == NULL) { 1208 SETERROR(REG_ESPACE); 1209 return; 1210 } 1211 cs->ranges = newranges; 1212 cs->ranges[cs->nranges].min = min; 1213 cs->ranges[cs->nranges].max = max; 1214 cs->nranges++; 1215 } 1216 1217 /* 1218 * CHaddtype - add all characters of a certain type to a character set. 1219 */ 1220 static void 1221 CHaddtype(struct parse *p, cset *cs, wctype_t wct) 1222 { 1223 wint_t i; 1224 wctype_t *newtypes; 1225 1226 for (i = 0; i < NC; i++) 1227 if (iswctype(i, wct)) 1228 CHadd(p, cs, i); 1229 newtypes = realloc(cs->types, (cs->ntypes + 1) * 1230 sizeof (*cs->types)); 1231 if (newtypes == NULL) { 1232 SETERROR(REG_ESPACE); 1233 return; 1234 } 1235 cs->types = newtypes; 1236 cs->types[cs->ntypes++] = wct; 1237 } 1238 1239 /* 1240 * dupl - emit a duplicate of a bunch of sops 1241 */ 1242 static sopno /* start of duplicate */ 1243 dupl(struct parse *p, 1244 sopno start, /* from here */ 1245 sopno finish) /* to this less one */ 1246 { 1247 sopno ret = HERE(); 1248 sopno len = finish - start; 1249 1250 assert(finish >= start); 1251 if (len == 0) 1252 return (ret); 1253 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1254 return (ret); 1255 assert(p->ssize >= p->slen + len); 1256 (void) memcpy((char *)(p->strip + p->slen), 1257 (char *)(p->strip + start), (size_t)len*sizeof (sop)); 1258 p->slen += len; 1259 return (ret); 1260 } 1261 1262 /* 1263 * doemit - emit a strip operator 1264 * 1265 * It might seem better to implement this as a macro with a function as 1266 * hard-case backup, but it's just too big and messy unless there are 1267 * some changes to the data structures. Maybe later. 1268 */ 1269 static void 1270 doemit(struct parse *p, sop op, size_t opnd) 1271 { 1272 /* avoid making error situations worse */ 1273 if (p->error != 0) 1274 return; 1275 1276 /* deal with oversize operands ("can't happen", more or less) */ 1277 assert(opnd < 1<<OPSHIFT); 1278 1279 /* deal with undersized strip */ 1280 if (p->slen >= p->ssize) 1281 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1282 return; 1283 1284 /* finally, it's all reduced to the easy case */ 1285 p->strip[p->slen++] = SOP(op, opnd); 1286 } 1287 1288 /* 1289 * doinsert - insert a sop into the strip 1290 */ 1291 static void 1292 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1293 { 1294 sopno sn; 1295 sop s; 1296 int i; 1297 1298 /* avoid making error situations worse */ 1299 if (p->error != 0) 1300 return; 1301 1302 sn = HERE(); 1303 EMIT(op, opnd); /* do checks, ensure space */ 1304 assert(HERE() == sn+1); 1305 s = p->strip[sn]; 1306 1307 /* adjust paren pointers */ 1308 assert(pos > 0); 1309 for (i = 1; i < NPAREN; i++) { 1310 if (p->pbegin[i] >= pos) { 1311 p->pbegin[i]++; 1312 } 1313 if (p->pend[i] >= pos) { 1314 p->pend[i]++; 1315 } 1316 } 1317 1318 (void) memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1319 (HERE()-pos-1)*sizeof (sop)); 1320 p->strip[pos] = s; 1321 } 1322 1323 /* 1324 * dofwd - complete a forward reference 1325 */ 1326 static void 1327 dofwd(struct parse *p, sopno pos, sop value) 1328 { 1329 /* avoid making error situations worse */ 1330 if (p->error != 0) 1331 return; 1332 1333 assert(value < 1<<OPSHIFT); 1334 p->strip[pos] = OP(p->strip[pos]) | value; 1335 } 1336 1337 /* 1338 * enlarge - enlarge the strip 1339 */ 1340 static int 1341 enlarge(struct parse *p, sopno size) 1342 { 1343 sop *sp; 1344 1345 if (p->ssize >= size) 1346 return (1); 1347 1348 sp = (sop *)realloc(p->strip, size*sizeof (sop)); 1349 if (sp == NULL) { 1350 SETERROR(REG_ESPACE); 1351 return (0); 1352 } 1353 p->strip = sp; 1354 p->ssize = size; 1355 return (1); 1356 } 1357 1358 /* 1359 * stripsnug - compact the strip 1360 */ 1361 static void 1362 stripsnug(struct parse *p, struct re_guts *g) 1363 { 1364 g->nstates = p->slen; 1365 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof (sop)); 1366 if (g->strip == NULL) { 1367 SETERROR(REG_ESPACE); 1368 g->strip = p->strip; 1369 } 1370 } 1371 1372 /* 1373 * findmust - fill in must and mlen with longest mandatory literal string 1374 * 1375 * This algorithm could do fancy things like analyzing the operands of | 1376 * for common subsequences. Someday. This code is simple and finds most 1377 * of the interesting cases. 1378 * 1379 * Note that must and mlen got initialized during setup. 1380 */ 1381 static void 1382 findmust(struct parse *p, struct re_guts *g) 1383 { 1384 sop *scan; 1385 sop *start = NULL; 1386 sop *newstart = NULL; 1387 sopno newlen; 1388 sop s; 1389 char *cp; 1390 int offset; 1391 char buf[MB_LEN_MAX]; 1392 size_t clen; 1393 mbstate_t mbs; 1394 locale_t loc = uselocale(NULL); 1395 1396 /* avoid making error situations worse */ 1397 if (p->error != 0) 1398 return; 1399 1400 /* 1401 * It's not generally safe to do a ``char'' substring search on 1402 * multibyte character strings, but it's safe for at least 1403 * UTF-8 (see RFC 3629). 1404 */ 1405 if (MB_CUR_MAX > 1 && 1406 strcmp(loc->runelocale->__encoding, "UTF-8") != 0) 1407 return; 1408 1409 /* find the longest OCHAR sequence in strip */ 1410 newlen = 0; 1411 offset = 0; 1412 g->moffset = 0; 1413 scan = g->strip + 1; 1414 do { 1415 s = *scan++; 1416 switch (OP(s)) { 1417 case OCHAR: /* sequence member */ 1418 if (newlen == 0) { /* new sequence */ 1419 (void) memset(&mbs, 0, sizeof (mbs)); 1420 newstart = scan - 1; 1421 } 1422 clen = wcrtomb(buf, OPND(s), &mbs); 1423 if (clen == (size_t)-1) 1424 goto toohard; 1425 newlen += clen; 1426 break; 1427 case OPLUS_: /* things that don't break one */ 1428 case OLPAREN: 1429 case ORPAREN: 1430 break; 1431 case OQUEST_: /* things that must be skipped */ 1432 case OCH_: 1433 offset = altoffset(scan, offset); 1434 scan--; 1435 do { 1436 scan += OPND(s); 1437 s = *scan; 1438 /* assert() interferes w debug printouts */ 1439 if (OP(s) != O_QUEST && OP(s) != O_CH && 1440 OP(s) != OOR2) { 1441 g->iflags |= BAD; 1442 return; 1443 } 1444 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1445 /* FALLTHROUGH */ 1446 case OBOW: /* things that break a sequence */ 1447 case OEOW: 1448 case OBOL: 1449 case OEOL: 1450 case O_QUEST: 1451 case O_CH: 1452 case OEND: 1453 if (newlen > g->mlen) { /* ends one */ 1454 start = newstart; 1455 g->mlen = newlen; 1456 if (offset > -1) { 1457 g->moffset += offset; 1458 offset = newlen; 1459 } else 1460 g->moffset = offset; 1461 } else { 1462 if (offset > -1) 1463 offset += newlen; 1464 } 1465 newlen = 0; 1466 break; 1467 case OANY: 1468 if (newlen > g->mlen) { /* ends one */ 1469 start = newstart; 1470 g->mlen = newlen; 1471 if (offset > -1) { 1472 g->moffset += offset; 1473 offset = newlen; 1474 } else 1475 g->moffset = offset; 1476 } else { 1477 if (offset > -1) 1478 offset += newlen; 1479 } 1480 if (offset > -1) 1481 offset++; 1482 newlen = 0; 1483 break; 1484 case OANYOF: /* may or may not invalidate offset */ 1485 /* First, everything as OANY */ 1486 if (newlen > g->mlen) { /* ends one */ 1487 start = newstart; 1488 g->mlen = newlen; 1489 if (offset > -1) { 1490 g->moffset += offset; 1491 offset = newlen; 1492 } else 1493 g->moffset = offset; 1494 } else { 1495 if (offset > -1) 1496 offset += newlen; 1497 } 1498 if (offset > -1) 1499 offset++; 1500 newlen = 0; 1501 break; 1502 toohard: 1503 default: 1504 /* 1505 * Anything here makes it impossible or too hard 1506 * to calculate the offset -- so we give up; 1507 * save the last known good offset, in case the 1508 * must sequence doesn't occur later. 1509 */ 1510 if (newlen > g->mlen) { /* ends one */ 1511 start = newstart; 1512 g->mlen = newlen; 1513 if (offset > -1) 1514 g->moffset += offset; 1515 else 1516 g->moffset = offset; 1517 } 1518 offset = -1; 1519 newlen = 0; 1520 break; 1521 } 1522 } while (OP(s) != OEND); 1523 1524 if (g->mlen == 0) { /* there isn't one */ 1525 g->moffset = -1; 1526 return; 1527 } 1528 1529 /* turn it into a character string */ 1530 g->must = malloc((size_t)g->mlen + 1); 1531 if (g->must == NULL) { /* argh; just forget it */ 1532 g->mlen = 0; 1533 g->moffset = -1; 1534 return; 1535 } 1536 cp = g->must; 1537 scan = start; 1538 (void) memset(&mbs, 0, sizeof (mbs)); 1539 while (cp < g->must + g->mlen) { 1540 while (OP(s = *scan++) != OCHAR) 1541 continue; 1542 clen = wcrtomb(cp, OPND(s), &mbs); 1543 assert(clen != (size_t)-1); 1544 cp += clen; 1545 } 1546 assert(cp == g->must + g->mlen); 1547 *cp++ = '\0'; /* just on general principles */ 1548 } 1549 1550 /* 1551 * altoffset - choose biggest offset among multiple choices 1552 * 1553 * Compute, recursively if necessary, the largest offset among multiple 1554 * re paths. 1555 */ 1556 static int 1557 altoffset(sop *scan, int offset) 1558 { 1559 int largest; 1560 int try; 1561 sop s; 1562 1563 /* If we gave up already on offsets, return */ 1564 if (offset == -1) 1565 return (-1); 1566 1567 largest = 0; 1568 try = 0; 1569 s = *scan++; 1570 while (OP(s) != O_QUEST && OP(s) != O_CH) { 1571 switch (OP(s)) { 1572 case OOR1: 1573 if (try > largest) 1574 largest = try; 1575 try = 0; 1576 break; 1577 case OQUEST_: 1578 case OCH_: 1579 try = altoffset(scan, try); 1580 if (try == -1) 1581 return (-1); 1582 scan--; 1583 do { 1584 scan += OPND(s); 1585 s = *scan; 1586 if (OP(s) != O_QUEST && OP(s) != O_CH && 1587 OP(s) != OOR2) 1588 return (-1); 1589 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1590 /* 1591 * We must skip to the next position, or we'll 1592 * leave altoffset() too early. 1593 */ 1594 scan++; 1595 break; 1596 case OANYOF: 1597 case OCHAR: 1598 case OANY: 1599 try++; 1600 /*FALLTHRU*/ 1601 case OBOW: 1602 case OEOW: 1603 case OLPAREN: 1604 case ORPAREN: 1605 case OOR2: 1606 break; 1607 default: 1608 try = -1; 1609 break; 1610 } 1611 if (try == -1) 1612 return (-1); 1613 s = *scan++; 1614 } 1615 1616 if (try > largest) 1617 largest = try; 1618 1619 return (largest+offset); 1620 } 1621 1622 /* 1623 * computejumps - compute char jumps for BM scan 1624 * 1625 * This algorithm assumes g->must exists and is has size greater than 1626 * zero. It's based on the algorithm found on Computer Algorithms by 1627 * Sara Baase. 1628 * 1629 * A char jump is the number of characters one needs to jump based on 1630 * the value of the character from the text that was mismatched. 1631 */ 1632 static void 1633 computejumps(struct parse *p, struct re_guts *g) 1634 { 1635 int ch; 1636 int mindex; 1637 1638 /* Avoid making errors worse */ 1639 if (p->error != 0) 1640 return; 1641 1642 g->charjump = (int *)malloc((NC + 1) * sizeof (int)); 1643 if (g->charjump == NULL) /* Not a fatal error */ 1644 return; 1645 /* Adjust for signed chars, if necessary */ 1646 g->charjump = &g->charjump[-(CHAR_MIN)]; 1647 1648 /* 1649 * If the character does not exist in the pattern, the jump 1650 * is equal to the number of characters in the pattern. 1651 */ 1652 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 1653 g->charjump[ch] = g->mlen; 1654 1655 /* 1656 * If the character does exist, compute the jump that would 1657 * take us to the last character in the pattern equal to it 1658 * (notice that we match right to left, so that last character 1659 * is the first one that would be matched). 1660 */ 1661 for (mindex = 0; mindex < g->mlen; mindex++) 1662 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 1663 } 1664 1665 /* 1666 * computematchjumps - compute match jumps for BM scan 1667 * 1668 * This algorithm assumes g->must exists and is has size greater than 1669 * zero. It's based on the algorithm found on Computer Algorithms by 1670 * Sara Baase. 1671 * 1672 * A match jump is the number of characters one needs to advance based 1673 * on the already-matched suffix. 1674 * Notice that all values here are minus (g->mlen-1), because of the way 1675 * the search algorithm works. 1676 */ 1677 static void 1678 computematchjumps(struct parse *p, struct re_guts *g) 1679 { 1680 int mindex; /* General "must" iterator */ 1681 int suffix; /* Keeps track of matching suffix */ 1682 int ssuffix; /* Keeps track of suffixes' suffix */ 1683 int *pmatches; 1684 /* 1685 * pmatches[k] points to the next i 1686 * such that i+1...mlen is a substring 1687 * of k+1...k+mlen-i-1 1688 */ 1689 1690 /* Avoid making errors worse */ 1691 if (p->error != 0) 1692 return; 1693 1694 pmatches = (int *)malloc(g->mlen * sizeof (unsigned int)); 1695 if (pmatches == NULL) { 1696 g->matchjump = NULL; 1697 return; 1698 } 1699 1700 g->matchjump = (int *)malloc(g->mlen * sizeof (unsigned int)); 1701 if (g->matchjump == NULL) { /* Not a fatal error */ 1702 free(pmatches); 1703 return; 1704 } 1705 1706 /* Set maximum possible jump for each character in the pattern */ 1707 for (mindex = 0; mindex < g->mlen; mindex++) 1708 g->matchjump[mindex] = 2*g->mlen - mindex - 1; 1709 1710 /* Compute pmatches[] */ 1711 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 1712 mindex--, suffix--) { 1713 pmatches[mindex] = suffix; 1714 1715 /* 1716 * If a mismatch is found, interrupting the substring, 1717 * compute the matchjump for that position. If no 1718 * mismatch is found, then a text substring mismatched 1719 * against the suffix will also mismatch against the 1720 * substring. 1721 */ 1722 while (suffix < g->mlen && g->must[mindex] != g->must[suffix]) { 1723 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1724 g->mlen - mindex - 1); 1725 suffix = pmatches[suffix]; 1726 } 1727 } 1728 1729 /* 1730 * Compute the matchjump up to the last substring found to jump 1731 * to the beginning of the largest must pattern prefix matching 1732 * it's own suffix. 1733 */ 1734 for (mindex = 0; mindex <= suffix; mindex++) 1735 g->matchjump[mindex] = MIN(g->matchjump[mindex], 1736 g->mlen + suffix - mindex); 1737 1738 ssuffix = pmatches[suffix]; 1739 while (suffix < g->mlen) { 1740 while (suffix <= ssuffix && suffix < g->mlen) { 1741 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1742 g->mlen + ssuffix - suffix); 1743 suffix++; 1744 } 1745 if (suffix < g->mlen) 1746 ssuffix = pmatches[ssuffix]; 1747 } 1748 1749 free(pmatches); 1750 } 1751 1752 /* 1753 * pluscount - count + nesting 1754 */ 1755 static sopno /* nesting depth */ 1756 pluscount(struct parse *p, struct re_guts *g) 1757 { 1758 sop *scan; 1759 sop s; 1760 sopno plusnest = 0; 1761 sopno maxnest = 0; 1762 1763 if (p->error != 0) 1764 return (0); /* there may not be an OEND */ 1765 1766 scan = g->strip + 1; 1767 do { 1768 s = *scan++; 1769 switch (OP(s)) { 1770 case OPLUS_: 1771 plusnest++; 1772 break; 1773 case O_PLUS: 1774 if (plusnest > maxnest) 1775 maxnest = plusnest; 1776 plusnest--; 1777 break; 1778 } 1779 } while (OP(s) != OEND); 1780 if (plusnest != 0) 1781 g->iflags |= BAD; 1782 return (maxnest); 1783 } 1784