1 /* 2 * Copyright 2013 Garrett D'Amore <garrett@damore.org> 3 * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. 4 * Copyright 2012 Milan Jurik. All rights reserved. 5 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 6 * Copyright (c) 1992, 1993, 1994 7 * The Regents of the University of California. All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * Henry Spencer. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 */ 36 37 #include "lint.h" 38 #include "file64.h" 39 #include <sys/types.h> 40 #include <stdio.h> 41 #include <string.h> 42 #include <ctype.h> 43 #include <limits.h> 44 #include <regex.h> 45 #include <stdlib.h> 46 #include <stdbool.h> 47 #include <wchar.h> 48 #include <wctype.h> 49 50 #include "../locale/runetype.h" 51 #include "../locale/collate.h" 52 53 #include "utils.h" 54 #include "regex2.h" 55 56 #include "cname.h" 57 #include "../locale/mblocal.h" 58 59 /* 60 * Branching context, used to keep track of branch state for all of the branch- 61 * aware functions. In addition to keeping track of branch positions for the 62 * p_branch_* functions, we use this to simplify some clumsiness in BREs for 63 * detection of whether ^ is acting as an anchor or being used erroneously and 64 * also for whether we're in a sub-expression or not. 65 */ 66 struct branchc { 67 sopno start; 68 sopno back; 69 sopno fwd; 70 71 int nbranch; 72 int nchain; 73 bool outer; 74 bool terminate; 75 }; 76 77 /* 78 * parse structure, passed up and down to avoid global variables and 79 * other clumsinesses 80 */ 81 struct parse { 82 const char *next; /* next character in RE */ 83 const char *end; /* end of string (-> NUL normally) */ 84 int error; /* has an error been seen? */ 85 sop *strip; /* malloced strip */ 86 sopno ssize; /* malloced strip size (allocated) */ 87 sopno slen; /* malloced strip length (used) */ 88 int ncsalloc; /* number of csets allocated */ 89 struct re_guts *g; 90 #define NPAREN 10 /* we need to remember () 1-9 for back refs */ 91 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 92 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 93 bool allowbranch; /* can this expression branch? */ 94 bool bre; /* convenience; is this a BRE? */ 95 bool (*parse_expr)(struct parse *, struct branchc *); 96 void (*pre_parse)(struct parse *, struct branchc *); 97 void (*post_parse)(struct parse *, struct branchc *); 98 }; 99 100 /* ========= begin header generated by ./mkh ========= */ 101 #ifdef __cplusplus 102 extern "C" { 103 #endif 104 105 /* === regcomp.c === */ 106 static bool p_ere_exp(struct parse *p, struct branchc *bc); 107 static void p_str(struct parse *p); 108 static int p_branch_eat_delim(struct parse *p, struct branchc *bc); 109 static void p_branch_ins_offset(struct parse *p, struct branchc *bc); 110 static void p_branch_fix_tail(struct parse *p, struct branchc *bc); 111 static bool p_branch_empty(struct parse *p, struct branchc *bc); 112 static bool p_branch_do(struct parse *p, struct branchc *bc); 113 static void p_bre_pre_parse(struct parse *p, struct branchc *bc); 114 static void p_bre_post_parse(struct parse *p, struct branchc *bc); 115 static void p_re(struct parse *p, int end1, int end2); 116 static bool p_simp_re(struct parse *p, struct branchc *bc); 117 static int p_count(struct parse *p); 118 static void p_bracket(struct parse *p); 119 static void p_b_term(struct parse *p, cset *cs); 120 static void p_b_cclass(struct parse *p, cset *cs); 121 static void p_b_eclass(struct parse *p, cset *cs); 122 static wint_t p_b_symbol(struct parse *p); 123 static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 124 static wint_t othercase(wint_t ch); 125 static void bothcases(struct parse *p, wint_t ch); 126 static void ordinary(struct parse *p, wint_t ch); 127 static void nonnewline(struct parse *p); 128 static void repeat(struct parse *p, sopno start, int from, int to); 129 static int seterr(struct parse *p, int e); 130 static cset *allocset(struct parse *p); 131 static void freeset(struct parse *p, cset *cs); 132 static void CHadd(struct parse *p, cset *cs, wint_t ch); 133 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 134 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 135 static wint_t singleton(cset *cs); 136 static sopno dupl(struct parse *p, sopno start, sopno finish); 137 static void doemit(struct parse *p, sop op, size_t opnd); 138 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 139 static void dofwd(struct parse *p, sopno pos, sop value); 140 static int enlarge(struct parse *p, sopno size); 141 static void stripsnug(struct parse *p, struct re_guts *g); 142 static void findmust(struct parse *p, struct re_guts *g); 143 static int altoffset(sop *scan, int offset); 144 static void computejumps(struct parse *p, struct re_guts *g); 145 static void computematchjumps(struct parse *p, struct re_guts *g); 146 static sopno pluscount(struct parse *p, struct re_guts *g); 147 static wint_t wgetnext(struct parse *p); 148 149 #ifdef __cplusplus 150 } 151 #endif 152 /* ========= end header generated by ./mkh ========= */ 153 154 static char nuls[10]; /* place to point scanner in event of error */ 155 156 /* 157 * macros for use with parse structure 158 * BEWARE: these know that the parse structure is named `p' !!! 159 */ 160 #define PEEK() (*p->next) 161 #define PEEK2() (*(p->next+1)) 162 #define MORE() (p->next < p->end) 163 #define MORE2() (p->next+1 < p->end) 164 #define SEE(c) (MORE() && PEEK() == (c)) 165 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 166 #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a)) 167 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 168 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 169 #define NEXT() (p->next++) 170 #define NEXT2() (p->next += 2) 171 #define NEXTn(n) (p->next += (n)) 172 #define GETNEXT() (*p->next++) 173 #define WGETNEXT() wgetnext(p) 174 #define SETERROR(e) ((void)seterr(p, (e))) 175 #define REQUIRE(co, e) ((co) || seterr(p, e)) 176 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 177 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 178 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 179 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 180 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 181 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 182 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 183 #define HERE() (p->slen) 184 #define THERE() (p->slen - 1) 185 #define THERETHERE() (p->slen - 2) 186 #define DROP(n) (p->slen -= (n)) 187 188 #ifndef NDEBUG 189 static int never = 0; /* for use in asserts; shuts lint up */ 190 #else 191 #define never 0 /* some <assert.h>s have bugs too */ 192 #endif 193 194 /* 195 * regcomp - interface for parser and compilation 196 */ 197 int /* 0 success, otherwise REG_something */ 198 regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern, 199 int cflags) 200 { 201 struct parse pa; 202 struct re_guts *g; 203 struct parse *p = &pa; 204 int i; 205 size_t len; 206 size_t maxlen; 207 #ifdef REDEBUG 208 #define GOODFLAGS(f) (f) 209 #else 210 #define GOODFLAGS(f) ((f)&~REG_DUMP) 211 #endif 212 213 /* We had REG_INVARG, but we don't have that on Solaris. */ 214 cflags = GOODFLAGS(cflags); 215 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 216 return (REG_EFATAL); 217 218 if (cflags®_PEND) { 219 if (preg->re_endp < pattern) 220 return (REG_EFATAL); 221 len = preg->re_endp - pattern; 222 } else 223 len = strlen(pattern); 224 225 /* do the mallocs early so failure handling is easy */ 226 g = (struct re_guts *)malloc(sizeof (struct re_guts)); 227 if (g == NULL) 228 return (REG_ESPACE); 229 /* 230 * Limit the pattern space to avoid a 32-bit overflow on buffer 231 * extension. Also avoid any signed overflow in case of conversion 232 * so make the real limit based on a 31-bit overflow. 233 * 234 * Likely not applicable on 64-bit systems but handle the case 235 * generically (who are we to stop people from using ~715MB+ 236 * patterns?). 237 */ 238 maxlen = ((size_t)-1 >> 1) / sizeof (sop) * 2 / 3; 239 if (len >= maxlen) { 240 free((char *)g); 241 return (REG_ESPACE); 242 } 243 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 244 assert(p->ssize >= len); 245 246 p->strip = (sop *)malloc(p->ssize * sizeof (sop)); 247 p->slen = 0; 248 if (p->strip == NULL) { 249 free((char *)g); 250 return (REG_ESPACE); 251 } 252 253 /* set things up */ 254 p->g = g; 255 p->next = pattern; /* convenience; we do not modify it */ 256 p->end = p->next + len; 257 p->error = 0; 258 p->ncsalloc = 0; 259 for (i = 0; i < NPAREN; i++) { 260 p->pbegin[i] = 0; 261 p->pend[i] = 0; 262 } 263 if (cflags & REG_EXTENDED) { 264 p->allowbranch = true; 265 p->bre = false; 266 p->parse_expr = p_ere_exp; 267 p->pre_parse = NULL; 268 p->post_parse = NULL; 269 } else { 270 p->allowbranch = false; 271 p->bre = true; 272 p->parse_expr = p_simp_re; 273 p->pre_parse = p_bre_pre_parse; 274 p->post_parse = p_bre_post_parse; 275 } 276 g->sets = NULL; 277 g->ncsets = 0; 278 g->cflags = cflags; 279 g->iflags = 0; 280 g->nbol = 0; 281 g->neol = 0; 282 g->must = NULL; 283 g->moffset = -1; 284 g->charjump = NULL; 285 g->matchjump = NULL; 286 g->mlen = 0; 287 g->nsub = 0; 288 g->backrefs = 0; 289 290 /* do it */ 291 EMIT(OEND, 0); 292 g->firststate = THERE(); 293 if (cflags & REG_NOSPEC) 294 p_str(p); 295 else 296 p_re(p, OUT, OUT); 297 EMIT(OEND, 0); 298 g->laststate = THERE(); 299 300 /* tidy up loose ends and fill things in */ 301 stripsnug(p, g); 302 findmust(p, g); 303 /* 304 * only use Boyer-Moore algorithm if the pattern is bigger 305 * than three characters 306 */ 307 if (g->mlen > 3) { 308 computejumps(p, g); 309 computematchjumps(p, g); 310 if (g->matchjump == NULL && g->charjump != NULL) { 311 free(g->charjump); 312 g->charjump = NULL; 313 } 314 } 315 g->nplus = pluscount(p, g); 316 g->magic = MAGIC2; 317 preg->re_nsub = g->nsub; 318 preg->re_g = g; 319 preg->re_magic = MAGIC1; 320 #ifndef REDEBUG 321 /* not debugging, so can't rely on the assert() in regexec() */ 322 if (g->iflags&BAD) 323 SETERROR(REG_EFATAL); 324 #endif 325 326 /* win or lose, we're done */ 327 if (p->error != 0) /* lose */ 328 regfree(preg); 329 return (p->error); 330 } 331 332 /* 333 * Parse one subERE, an atom possibly followed by a repetition op, 334 * return whether we should terminate or not. 335 */ 336 static bool 337 p_ere_exp(struct parse *p, struct branchc *bc) 338 { 339 char c; 340 wint_t wc; 341 sopno pos; 342 int count; 343 int count2; 344 sopno subno; 345 int wascaret = 0; 346 347 (void) bc; 348 assert(MORE()); /* caller should have ensured this */ 349 c = GETNEXT(); 350 351 pos = HERE(); 352 switch (c) { 353 case '(': 354 (void) REQUIRE(MORE(), REG_EPAREN); 355 p->g->nsub++; 356 subno = p->g->nsub; 357 if (subno < NPAREN) 358 p->pbegin[subno] = HERE(); 359 EMIT(OLPAREN, subno); 360 if (!SEE(')')) 361 p_re(p, ')', IGN); 362 if (subno < NPAREN) { 363 p->pend[subno] = HERE(); 364 assert(p->pend[subno] != 0); 365 } 366 EMIT(ORPAREN, subno); 367 (void) MUSTEAT(')', REG_EPAREN); 368 break; 369 #ifndef POSIX_MISTAKE 370 case ')': /* happens only if no current unmatched ( */ 371 /* 372 * You may ask, why the ifndef? Because I didn't notice 373 * this until slightly too late for 1003.2, and none of the 374 * other 1003.2 regular-expression reviewers noticed it at 375 * all. So an unmatched ) is legal POSIX, at least until 376 * we can get it fixed. 377 */ 378 SETERROR(REG_EPAREN); 379 break; 380 #endif 381 case '^': 382 EMIT(OBOL, 0); 383 p->g->iflags |= USEBOL; 384 p->g->nbol++; 385 wascaret = 1; 386 break; 387 case '$': 388 EMIT(OEOL, 0); 389 p->g->iflags |= USEEOL; 390 p->g->neol++; 391 break; 392 case '|': 393 SETERROR(REG_BADPAT); 394 break; 395 case '*': 396 case '+': 397 case '?': 398 case '{': 399 SETERROR(REG_BADRPT); 400 break; 401 case '.': 402 if (p->g->cflags®_NEWLINE) 403 nonnewline(p); 404 else 405 EMIT(OANY, 0); 406 break; 407 case '[': 408 p_bracket(p); 409 break; 410 case '\\': 411 (void) REQUIRE(MORE(), REG_EESCAPE); 412 wc = WGETNEXT(); 413 switch (wc) { 414 case '<': 415 EMIT(OBOW, 0); 416 break; 417 case '>': 418 EMIT(OEOW, 0); 419 break; 420 default: 421 ordinary(p, wc); 422 break; 423 } 424 break; 425 default: 426 if (p->error != 0) 427 return (false); 428 p->next--; 429 wc = WGETNEXT(); 430 ordinary(p, wc); 431 break; 432 } 433 434 if (!MORE()) 435 return (false); 436 c = PEEK(); 437 /* we call { a repetition if followed by a digit */ 438 if (!(c == '*' || c == '+' || c == '?' || c == '{')) 439 return (false); /* no repetition, we're done */ 440 else if (c == '{') 441 (void) REQUIRE(MORE2() && \ 442 (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT); 443 NEXT(); 444 445 (void) REQUIRE(!wascaret, REG_BADRPT); 446 switch (c) { 447 case '*': /* implemented as +? */ 448 /* this case does not require the (y|) trick, noKLUDGE */ 449 INSERT(OPLUS_, pos); 450 ASTERN(O_PLUS, pos); 451 INSERT(OQUEST_, pos); 452 ASTERN(O_QUEST, pos); 453 break; 454 case '+': 455 INSERT(OPLUS_, pos); 456 ASTERN(O_PLUS, pos); 457 break; 458 case '?': 459 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 460 INSERT(OCH_, pos); /* offset slightly wrong */ 461 ASTERN(OOR1, pos); /* this one's right */ 462 AHEAD(pos); /* fix the OCH_ */ 463 EMIT(OOR2, 0); /* offset very wrong... */ 464 AHEAD(THERE()); /* ...so fix it */ 465 ASTERN(O_CH, THERETHERE()); 466 break; 467 case '{': 468 count = p_count(p); 469 if (EAT(',')) { 470 if (isdigit((uch)PEEK())) { 471 count2 = p_count(p); 472 (void) REQUIRE(count <= count2, REG_BADBR); 473 } else /* single number with comma */ 474 count2 = INFINITY; 475 } else /* just a single number */ 476 count2 = count; 477 repeat(p, pos, count, count2); 478 if (!EAT('}')) { /* error heuristics */ 479 while (MORE() && PEEK() != '}') 480 NEXT(); 481 (void) REQUIRE(MORE(), REG_EBRACE); 482 SETERROR(REG_BADBR); 483 } 484 break; 485 } 486 487 if (!MORE()) 488 return (false); 489 c = PEEK(); 490 if (!(c == '*' || c == '+' || c == '?' || 491 (c == '{' && MORE2() && isdigit((uch)PEEK2())))) 492 return (false); 493 SETERROR(REG_BADRPT); 494 return (false); 495 } 496 497 /* 498 * p_str - string (no metacharacters) "parser" 499 */ 500 static void 501 p_str(struct parse *p) 502 { 503 (void) REQUIRE(MORE(), REG_BADPAT); 504 while (MORE()) 505 ordinary(p, WGETNEXT()); 506 } 507 508 /* 509 * Eat consecutive branch delimiters for the kind of expression that we are 510 * parsing, return the number of delimiters that we ate. 511 */ 512 static int 513 p_branch_eat_delim(struct parse *p, struct branchc *bc) 514 { 515 int nskip; 516 517 (void) bc; 518 nskip = 0; 519 while (EAT('|')) 520 ++nskip; 521 return (nskip); 522 } 523 524 /* 525 * Insert necessary branch book-keeping operations. This emits a 526 * bogus 'next' offset, since we still have more to parse 527 */ 528 static void 529 p_branch_ins_offset(struct parse *p, struct branchc *bc) 530 { 531 if (bc->nbranch == 0) { 532 INSERT(OCH_, bc->start); /* offset is wrong */ 533 bc->fwd = bc->start; 534 bc->back = bc->start; 535 } 536 537 ASTERN(OOR1, bc->back); 538 bc->back = THERE(); 539 AHEAD(bc->fwd); /* fix previous offset */ 540 bc->fwd = HERE(); 541 EMIT(OOR2, 0); /* offset is very wrong */ 542 ++bc->nbranch; 543 } 544 545 /* 546 * Fix the offset of the tail branch, if we actually had any branches. 547 * This is to correct the bogus placeholder offset that we use. 548 */ 549 static void 550 p_branch_fix_tail(struct parse *p, struct branchc *bc) 551 { 552 /* Fix bogus offset at the tail if we actually have branches */ 553 if (bc->nbranch > 0) { 554 AHEAD(bc->fwd); 555 ASTERN(O_CH, bc->back); 556 } 557 } 558 559 /* 560 * Signal to the parser that an empty branch has been encountered; this will, 561 * in the future, be used to allow for more permissive behavior with empty 562 * branches. The return value should indicate whether parsing may continue 563 * or not. 564 */ 565 static bool 566 p_branch_empty(struct parse *p, struct branchc *bc) 567 { 568 (void) bc; 569 SETERROR(REG_BADPAT); 570 return (false); 571 } 572 573 /* 574 * Take care of any branching requirements. This includes inserting the 575 * appropriate branching instructions as well as eating all of the branch 576 * delimiters until we either run out of pattern or need to parse more pattern. 577 */ 578 static bool 579 p_branch_do(struct parse *p, struct branchc *bc) 580 { 581 int ate = 0; 582 583 ate = p_branch_eat_delim(p, bc); 584 if (ate == 0) 585 return (false); 586 else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc)) 587 /* 588 * Halt parsing only if we have an empty branch and 589 * p_branch_empty indicates that we must not continue. 590 * In the future, this will not necessarily be an error. 591 */ 592 return (false); 593 p_branch_ins_offset(p, bc); 594 595 return (true); 596 } 597 598 static void 599 p_bre_pre_parse(struct parse *p, struct branchc *bc) 600 { 601 (void) bc; 602 /* 603 * Does not move cleanly into expression parser because of 604 * ordinary interpration of * at the beginning position of 605 * an expression. 606 */ 607 if (EAT('^')) { 608 EMIT(OBOL, 0); 609 p->g->iflags |= USEBOL; 610 p->g->nbol++; 611 } 612 } 613 614 static void 615 p_bre_post_parse(struct parse *p, struct branchc *bc) 616 { 617 /* Expression is terminating due to EOL token */ 618 if (bc->terminate) { 619 DROP(1); 620 EMIT(OEOL, 0); 621 p->g->iflags |= USEEOL; 622 p->g->neol++; 623 } 624 } 625 626 /* 627 * Top level parser, concatenation and BRE anchoring. 628 * Giving end1 as OUT essentially eliminates the end1/end2 check. 629 * 630 * This implementation is a bit of a kludge, in that a trailing $ is first 631 * taken as an ordinary character and then revised to be an anchor. 632 * The amount of lookahead needed to avoid this kludge is excessive. 633 */ 634 static void 635 p_re(struct parse *p, 636 int end1, /* first terminating character */ 637 int end2) /* second terminating character; ignored for EREs */ 638 { 639 struct branchc bc; 640 641 bc.nbranch = 0; 642 if (end1 == OUT && end2 == OUT) 643 bc.outer = true; 644 else 645 bc.outer = false; 646 #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2)) 647 for (;;) { 648 bc.start = HERE(); 649 bc.nchain = 0; 650 bc.terminate = false; 651 if (p->pre_parse != NULL) 652 p->pre_parse(p, &bc); 653 while (MORE() && (!p->allowbranch || !SEESPEC('|')) && 654 !SEEEND()) { 655 bc.terminate = p->parse_expr(p, &bc); 656 ++bc.nchain; 657 } 658 if (p->post_parse != NULL) 659 p->post_parse(p, &bc); 660 (void) REQUIRE(HERE() != bc.start, REG_BADPAT); 661 if (!p->allowbranch) 662 break; 663 /* 664 * p_branch_do's return value indicates whether we should 665 * continue parsing or not. This is both for correctness and 666 * a slight optimization, because it will check if we've 667 * encountered an empty branch or the end of the string 668 * immediately following a branch delimiter. 669 */ 670 if (!p_branch_do(p, &bc)) 671 break; 672 } 673 #undef SEE_END 674 if (p->allowbranch) 675 p_branch_fix_tail(p, &bc); 676 assert(!MORE() || SEE(end1)); 677 } 678 679 /* 680 * p_simp_re - parse a simple RE, an atom possibly followed by a repetition 681 */ 682 static bool /* was the simple RE an unbackslashed $? */ 683 p_simp_re(struct parse *p, struct branchc *bc) 684 { 685 int c; 686 int count; 687 int count2; 688 sopno pos; 689 int i; 690 wint_t wc; 691 sopno subno; 692 #define BACKSL (1<<CHAR_BIT) 693 694 pos = HERE(); /* repetition op, if any, covers from here */ 695 696 assert(MORE()); /* caller should have ensured this */ 697 c = GETNEXT(); 698 if (c == '\\') { 699 (void) REQUIRE(MORE(), REG_EESCAPE); 700 c = BACKSL | GETNEXT(); 701 } 702 switch (c) { 703 case '.': 704 if (p->g->cflags®_NEWLINE) 705 nonnewline(p); 706 else 707 EMIT(OANY, 0); 708 break; 709 case '[': 710 p_bracket(p); 711 break; 712 case BACKSL|'<': 713 EMIT(OBOW, 0); 714 break; 715 case BACKSL|'>': 716 EMIT(OEOW, 0); 717 break; 718 case BACKSL|'{': 719 SETERROR(REG_BADRPT); 720 break; 721 case BACKSL|'(': 722 p->g->nsub++; 723 subno = p->g->nsub; 724 if (subno < NPAREN) 725 p->pbegin[subno] = HERE(); 726 EMIT(OLPAREN, subno); 727 /* the MORE here is an error heuristic */ 728 if (MORE() && !SEETWO('\\', ')')) 729 p_re(p, '\\', ')'); 730 if (subno < NPAREN) { 731 p->pend[subno] = HERE(); 732 assert(p->pend[subno] != 0); 733 } 734 EMIT(ORPAREN, subno); 735 (void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 736 break; 737 case BACKSL|')': /* should not get here -- must be user */ 738 SETERROR(REG_EPAREN); 739 break; 740 case BACKSL|'1': 741 case BACKSL|'2': 742 case BACKSL|'3': 743 case BACKSL|'4': 744 case BACKSL|'5': 745 case BACKSL|'6': 746 case BACKSL|'7': 747 case BACKSL|'8': 748 case BACKSL|'9': 749 i = (c&~BACKSL) - '0'; 750 assert(i < NPAREN); 751 if (p->pend[i] != 0) { 752 assert(i <= p->g->nsub); 753 EMIT(OBACK_, i); 754 assert(p->pbegin[i] != 0); 755 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 756 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 757 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 758 EMIT(O_BACK, i); 759 } else 760 SETERROR(REG_ESUBREG); 761 p->g->backrefs = 1; 762 break; 763 case '*': 764 /* 765 * Ordinary if used as the first character beyond BOL anchor of 766 * a (sub-)expression, counts as a bad repetition operator if it 767 * appears otherwise. 768 */ 769 (void) REQUIRE(bc->nchain == 0, REG_BADRPT); 770 /* FALLTHROUGH */ 771 default: 772 if (p->error != 0) 773 return (false); /* Definitely not $... */ 774 p->next--; 775 wc = WGETNEXT(); 776 ordinary(p, wc); 777 break; 778 } 779 780 if (EAT('*')) { /* implemented as +? */ 781 /* this case does not require the (y|) trick, noKLUDGE */ 782 INSERT(OPLUS_, pos); 783 ASTERN(O_PLUS, pos); 784 INSERT(OQUEST_, pos); 785 ASTERN(O_QUEST, pos); 786 } else if (EATTWO('\\', '{')) { 787 count = p_count(p); 788 if (EAT(',')) { 789 if (MORE() && isdigit((uch)PEEK())) { 790 count2 = p_count(p); 791 (void) REQUIRE(count <= count2, REG_BADBR); 792 } else /* single number with comma */ 793 count2 = INFINITY; 794 } else /* just a single number */ 795 count2 = count; 796 repeat(p, pos, count, count2); 797 if (!EATTWO('\\', '}')) { /* error heuristics */ 798 while (MORE() && !SEETWO('\\', '}')) 799 NEXT(); 800 (void) REQUIRE(MORE(), REG_EBRACE); 801 SETERROR(REG_BADBR); 802 } 803 } else if (c == '$') /* $ (but not \$) ends it */ 804 return (true); 805 806 return (false); 807 } 808 809 /* 810 * p_count - parse a repetition count 811 */ 812 static int /* the value */ 813 p_count(struct parse *p) 814 { 815 int count = 0; 816 int ndigits = 0; 817 818 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 819 count = count*10 + (GETNEXT() - '0'); 820 ndigits++; 821 } 822 823 (void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 824 return (count); 825 } 826 827 /* 828 * p_bracket - parse a bracketed character list 829 */ 830 static void 831 p_bracket(struct parse *p) 832 { 833 cset *cs; 834 wint_t ch; 835 836 /* Dept of Truly Sickening Special-Case Kludges */ 837 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 838 EMIT(OBOW, 0); 839 NEXTn(6); 840 return; 841 } 842 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 843 EMIT(OEOW, 0); 844 NEXTn(6); 845 return; 846 } 847 848 if ((cs = allocset(p)) == NULL) 849 return; 850 851 if (p->g->cflags®_ICASE) 852 cs->icase = 1; 853 if (EAT('^')) 854 cs->invert = 1; 855 if (EAT(']')) 856 CHadd(p, cs, ']'); 857 else if (EAT('-')) 858 CHadd(p, cs, '-'); 859 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 860 p_b_term(p, cs); 861 if (EAT('-')) 862 CHadd(p, cs, '-'); 863 (void) MUSTEAT(']', REG_EBRACK); 864 865 if (p->error != 0) /* don't mess things up further */ 866 return; 867 868 if (cs->invert && p->g->cflags®_NEWLINE) 869 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 870 871 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 872 ordinary(p, ch); 873 freeset(p, cs); 874 } else 875 EMIT(OANYOF, (int)(cs - p->g->sets)); 876 } 877 878 /* 879 * p_b_term - parse one term of a bracketed character list 880 */ 881 static void 882 p_b_term(struct parse *p, cset *cs) 883 { 884 char c; 885 wint_t start, finish; 886 wint_t i; 887 locale_t loc = uselocale(NULL); 888 889 /* classify what we've got */ 890 switch ((MORE()) ? PEEK() : '\0') { 891 case '[': 892 c = (MORE2()) ? PEEK2() : '\0'; 893 break; 894 case '-': 895 SETERROR(REG_ERANGE); 896 return; /* NOTE RETURN */ 897 default: 898 c = '\0'; 899 break; 900 } 901 902 switch (c) { 903 case ':': /* character class */ 904 NEXT2(); 905 (void) REQUIRE(MORE(), REG_EBRACK); 906 c = PEEK(); 907 (void) REQUIRE(c != '-' && c != ']', REG_ECTYPE); 908 p_b_cclass(p, cs); 909 (void) REQUIRE(MORE(), REG_EBRACK); 910 (void) REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 911 break; 912 case '=': /* equivalence class */ 913 NEXT2(); 914 (void) REQUIRE(MORE(), REG_EBRACK); 915 c = PEEK(); 916 (void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 917 p_b_eclass(p, cs); 918 (void) REQUIRE(MORE(), REG_EBRACK); 919 (void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 920 break; 921 default: /* symbol, ordinary character, or range */ 922 start = p_b_symbol(p); 923 if (SEE('-') && MORE2() && PEEK2() != ']') { 924 /* range */ 925 NEXT(); 926 if (EAT('-')) 927 finish = '-'; 928 else 929 finish = p_b_symbol(p); 930 } else 931 finish = start; 932 if (start == finish) 933 CHadd(p, cs, start); 934 else { 935 if (loc->collate->lc_is_posix) { 936 (void) REQUIRE((uch)start <= (uch)finish, 937 REG_ERANGE); 938 CHaddrange(p, cs, start, finish); 939 } else { 940 (void) REQUIRE(_collate_range_cmp(start, 941 finish, loc) <= 0, REG_ERANGE); 942 for (i = 0; i <= UCHAR_MAX; i++) { 943 if (_collate_range_cmp(start, i, loc) 944 <= 0 && 945 _collate_range_cmp(i, finish, loc) 946 <= 0) 947 CHadd(p, cs, i); 948 } 949 } 950 } 951 break; 952 } 953 } 954 955 /* 956 * p_b_cclass - parse a character-class name and deal with it 957 */ 958 static void 959 p_b_cclass(struct parse *p, cset *cs) 960 { 961 const char *sp = p->next; 962 size_t len; 963 wctype_t wct; 964 char clname[16]; 965 966 while (MORE() && isalpha((uch)PEEK())) 967 NEXT(); 968 len = p->next - sp; 969 if (len >= sizeof (clname) - 1) { 970 SETERROR(REG_ECTYPE); 971 return; 972 } 973 (void) memcpy(clname, sp, len); 974 clname[len] = '\0'; 975 if ((wct = wctype(clname)) == 0) { 976 SETERROR(REG_ECTYPE); 977 return; 978 } 979 CHaddtype(p, cs, wct); 980 } 981 982 /* 983 * p_b_eclass - parse an equivalence-class name and deal with it 984 * 985 * This implementation is incomplete. xxx 986 */ 987 static void 988 p_b_eclass(struct parse *p, cset *cs) 989 { 990 wint_t c; 991 992 c = p_b_coll_elem(p, '='); 993 CHadd(p, cs, c); 994 } 995 996 /* 997 * p_b_symbol - parse a character or [..]ed multicharacter collating symbol 998 */ 999 static wint_t /* value of symbol */ 1000 p_b_symbol(struct parse *p) 1001 { 1002 wint_t value; 1003 1004 (void) REQUIRE(MORE(), REG_EBRACK); 1005 if (!EATTWO('[', '.')) 1006 return (WGETNEXT()); 1007 1008 /* collating symbol */ 1009 value = p_b_coll_elem(p, '.'); 1010 (void) REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 1011 return (value); 1012 } 1013 1014 /* 1015 * p_b_coll_elem - parse a collating-element name and look it up 1016 */ 1017 static wint_t /* value of collating element */ 1018 p_b_coll_elem(struct parse *p, 1019 wint_t endc) /* name ended by endc,']' */ 1020 { 1021 const char *sp = p->next; 1022 struct cname *cp; 1023 mbstate_t mbs; 1024 wchar_t wc; 1025 size_t clen, len; 1026 1027 while (MORE() && !SEETWO(endc, ']')) 1028 NEXT(); 1029 if (!MORE()) { 1030 SETERROR(REG_EBRACK); 1031 return (0); 1032 } 1033 len = p->next - sp; 1034 for (cp = cnames; cp->name != NULL; cp++) 1035 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) 1036 return (cp->code); /* known name */ 1037 (void) memset(&mbs, 0, sizeof (mbs)); 1038 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 1039 return (wc); /* single character */ 1040 else if (clen == (size_t)-1 || clen == (size_t)-2) 1041 SETERROR(REG_ECHAR); 1042 else 1043 SETERROR(REG_ECOLLATE); /* neither */ 1044 return (0); 1045 } 1046 1047 /* 1048 * othercase - return the case counterpart of an alphabetic 1049 */ 1050 static wint_t /* if no counterpart, return ch */ 1051 othercase(wint_t ch) 1052 { 1053 assert(iswalpha(ch)); 1054 if (iswupper(ch)) 1055 return (towlower(ch)); 1056 else if (iswlower(ch)) 1057 return (towupper(ch)); 1058 else /* peculiar, but could happen */ 1059 return (ch); 1060 } 1061 1062 /* 1063 * bothcases - emit a dualcase version of a two-case character 1064 * 1065 * Boy, is this implementation ever a kludge... 1066 */ 1067 static void 1068 bothcases(struct parse *p, wint_t ch) 1069 { 1070 const char *oldnext = p->next; 1071 const char *oldend = p->end; 1072 char bracket[3 + MB_LEN_MAX]; 1073 size_t n; 1074 mbstate_t mbs; 1075 1076 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1077 p->next = bracket; 1078 (void) memset(&mbs, 0, sizeof (mbs)); 1079 n = wcrtomb(bracket, ch, &mbs); 1080 assert(n != (size_t)-1); 1081 bracket[n] = ']'; 1082 bracket[n + 1] = '\0'; 1083 p->end = bracket+n+1; 1084 p_bracket(p); 1085 assert(p->next == p->end); 1086 p->next = oldnext; 1087 p->end = oldend; 1088 } 1089 1090 /* 1091 * ordinary - emit an ordinary character 1092 */ 1093 static void 1094 ordinary(struct parse *p, wint_t ch) 1095 { 1096 cset *cs; 1097 1098 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 1099 bothcases(p, ch); 1100 else if ((ch & OPDMASK) == ch) 1101 EMIT(OCHAR, ch); 1102 else { 1103 /* 1104 * Kludge: character is too big to fit into an OCHAR operand. 1105 * Emit a singleton set. 1106 */ 1107 if ((cs = allocset(p)) == NULL) 1108 return; 1109 CHadd(p, cs, ch); 1110 EMIT(OANYOF, (int)(cs - p->g->sets)); 1111 } 1112 } 1113 1114 /* 1115 * nonnewline - emit REG_NEWLINE version of OANY 1116 * 1117 * Boy, is this implementation ever a kludge... 1118 */ 1119 static void 1120 nonnewline(struct parse *p) 1121 { 1122 const char *oldnext = p->next; 1123 const char *oldend = p->end; 1124 char bracket[4]; 1125 1126 p->next = bracket; 1127 p->end = bracket+3; 1128 bracket[0] = '^'; 1129 bracket[1] = '\n'; 1130 bracket[2] = ']'; 1131 bracket[3] = '\0'; 1132 p_bracket(p); 1133 assert(p->next == bracket+3); 1134 p->next = oldnext; 1135 p->end = oldend; 1136 } 1137 1138 /* 1139 * repeat - generate code for a bounded repetition, recursively if needed 1140 */ 1141 static void 1142 repeat(struct parse *p, 1143 sopno start, /* operand from here to end of strip */ 1144 int from, /* repeated from this number */ 1145 int to) /* to this number of times (maybe INFINITY) */ 1146 { 1147 sopno finish = HERE(); 1148 #define N 2 1149 #define INF 3 1150 #define REP(f, t) ((f)*8 + (t)) 1151 #define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1152 sopno copy; 1153 1154 if (p->error != 0) /* head off possible runaway recursion */ 1155 return; 1156 1157 assert(from <= to); 1158 1159 switch (REP(MAP(from), MAP(to))) { 1160 case REP(0, 0): /* must be user doing this */ 1161 DROP(finish-start); /* drop the operand */ 1162 break; 1163 case REP(0, 1): /* as x{1,1}? */ 1164 case REP(0, N): /* as x{1,n}? */ 1165 case REP(0, INF): /* as x{1,}? */ 1166 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1167 INSERT(OCH_, start); /* offset is wrong... */ 1168 repeat(p, start+1, 1, to); 1169 ASTERN(OOR1, start); 1170 AHEAD(start); /* ... fix it */ 1171 EMIT(OOR2, 0); 1172 AHEAD(THERE()); 1173 ASTERN(O_CH, THERETHERE()); 1174 break; 1175 case REP(1, 1): /* trivial case */ 1176 /* done */ 1177 break; 1178 case REP(1, N): /* as x?x{1,n-1} */ 1179 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1180 INSERT(OCH_, start); 1181 ASTERN(OOR1, start); 1182 AHEAD(start); 1183 EMIT(OOR2, 0); /* offset very wrong... */ 1184 AHEAD(THERE()); /* ...so fix it */ 1185 ASTERN(O_CH, THERETHERE()); 1186 copy = dupl(p, start+1, finish+1); 1187 assert(copy == finish+4); 1188 repeat(p, copy, 1, to-1); 1189 break; 1190 case REP(1, INF): /* as x+ */ 1191 INSERT(OPLUS_, start); 1192 ASTERN(O_PLUS, start); 1193 break; 1194 case REP(N, N): /* as xx{m-1,n-1} */ 1195 copy = dupl(p, start, finish); 1196 repeat(p, copy, from-1, to-1); 1197 break; 1198 case REP(N, INF): /* as xx{n-1,INF} */ 1199 copy = dupl(p, start, finish); 1200 repeat(p, copy, from-1, to); 1201 break; 1202 default: /* "can't happen" */ 1203 SETERROR(REG_EFATAL); /* just in case */ 1204 break; 1205 } 1206 } 1207 1208 /* 1209 * wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1210 * character from the parse struct, signals a REG_ILLSEQ error if the 1211 * character can't be converted. Returns the number of bytes consumed. 1212 */ 1213 static wint_t 1214 wgetnext(struct parse *p) 1215 { 1216 mbstate_t mbs; 1217 wchar_t wc; 1218 size_t n; 1219 1220 (void) memset(&mbs, 0, sizeof (mbs)); 1221 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); 1222 if (n == (size_t)-1 || n == (size_t)-2) { 1223 SETERROR(REG_ECHAR); 1224 return (0); 1225 } 1226 if (n == 0) 1227 n = 1; 1228 p->next += n; 1229 return (wc); 1230 } 1231 1232 /* 1233 * seterr - set an error condition 1234 */ 1235 static int /* useless but makes type checking happy */ 1236 seterr(struct parse *p, int e) 1237 { 1238 if (p->error == 0) /* keep earliest error condition */ 1239 p->error = e; 1240 p->next = nuls; /* try to bring things to a halt */ 1241 p->end = nuls; 1242 return (0); /* make the return value well-defined */ 1243 } 1244 1245 /* 1246 * allocset - allocate a set of characters for [] 1247 */ 1248 static cset * 1249 allocset(struct parse *p) 1250 { 1251 cset *cs, *ncs; 1252 1253 ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof (*ncs)); 1254 if (ncs == NULL) { 1255 SETERROR(REG_ESPACE); 1256 return (NULL); 1257 } 1258 p->g->sets = ncs; 1259 cs = &p->g->sets[p->g->ncsets++]; 1260 (void) memset(cs, 0, sizeof (*cs)); 1261 1262 return (cs); 1263 } 1264 1265 /* 1266 * freeset - free a now-unused set 1267 */ 1268 static void 1269 freeset(struct parse *p, cset *cs) 1270 { 1271 cset *top = &p->g->sets[p->g->ncsets]; 1272 1273 free(cs->wides); 1274 free(cs->ranges); 1275 free(cs->types); 1276 (void) memset(cs, 0, sizeof (*cs)); 1277 if (cs == top-1) /* recover only the easy case */ 1278 p->g->ncsets--; 1279 } 1280 1281 /* 1282 * singleton - Determine whether a set contains only one character, 1283 * returning it if so, otherwise returning OUT. 1284 */ 1285 static wint_t 1286 singleton(cset *cs) 1287 { 1288 wint_t i, s, n; 1289 1290 for (i = n = 0; i < NC; i++) 1291 if (CHIN(cs, i)) { 1292 n++; 1293 s = i; 1294 } 1295 if (n == 1) 1296 return (s); 1297 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1298 cs->icase == 0) 1299 return (cs->wides[0]); 1300 /* Don't bother handling the other cases. */ 1301 return (OUT); 1302 } 1303 1304 /* 1305 * CHadd - add character to character set. 1306 */ 1307 static void 1308 CHadd(struct parse *p, cset *cs, wint_t ch) 1309 { 1310 wint_t nch, *newwides; 1311 assert(ch >= 0); 1312 if (ch < NC) 1313 cs->bmp[ch >> 3] |= 1 << (ch & 7); 1314 else { 1315 newwides = realloc(cs->wides, (cs->nwides + 1) * 1316 sizeof (*cs->wides)); 1317 if (newwides == NULL) { 1318 SETERROR(REG_ESPACE); 1319 return; 1320 } 1321 cs->wides = newwides; 1322 cs->wides[cs->nwides++] = ch; 1323 } 1324 if (cs->icase) { 1325 if ((nch = towlower(ch)) < NC) 1326 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1327 if ((nch = towupper(ch)) < NC) 1328 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1329 } 1330 } 1331 1332 /* 1333 * CHaddrange - add all characters in the range [min,max] to a character set. 1334 */ 1335 static void 1336 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 1337 { 1338 crange *newranges; 1339 1340 for (; min < NC && min <= max; min++) 1341 CHadd(p, cs, min); 1342 if (min >= max) 1343 return; 1344 newranges = realloc(cs->ranges, (cs->nranges + 1) * 1345 sizeof (*cs->ranges)); 1346 if (newranges == NULL) { 1347 SETERROR(REG_ESPACE); 1348 return; 1349 } 1350 cs->ranges = newranges; 1351 cs->ranges[cs->nranges].min = min; 1352 cs->ranges[cs->nranges].max = max; 1353 cs->nranges++; 1354 } 1355 1356 /* 1357 * CHaddtype - add all characters of a certain type to a character set. 1358 */ 1359 static void 1360 CHaddtype(struct parse *p, cset *cs, wctype_t wct) 1361 { 1362 wint_t i; 1363 wctype_t *newtypes; 1364 1365 for (i = 0; i < NC; i++) 1366 if (iswctype(i, wct)) 1367 CHadd(p, cs, i); 1368 newtypes = realloc(cs->types, (cs->ntypes + 1) * 1369 sizeof (*cs->types)); 1370 if (newtypes == NULL) { 1371 SETERROR(REG_ESPACE); 1372 return; 1373 } 1374 cs->types = newtypes; 1375 cs->types[cs->ntypes++] = wct; 1376 } 1377 1378 /* 1379 * dupl - emit a duplicate of a bunch of sops 1380 */ 1381 static sopno /* start of duplicate */ 1382 dupl(struct parse *p, 1383 sopno start, /* from here */ 1384 sopno finish) /* to this less one */ 1385 { 1386 sopno ret = HERE(); 1387 sopno len = finish - start; 1388 1389 assert(finish >= start); 1390 if (len == 0) 1391 return (ret); 1392 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1393 return (ret); 1394 assert(p->ssize >= p->slen + len); 1395 (void) memcpy((char *)(p->strip + p->slen), 1396 (char *)(p->strip + start), (size_t)len*sizeof (sop)); 1397 p->slen += len; 1398 return (ret); 1399 } 1400 1401 /* 1402 * doemit - emit a strip operator 1403 * 1404 * It might seem better to implement this as a macro with a function as 1405 * hard-case backup, but it's just too big and messy unless there are 1406 * some changes to the data structures. Maybe later. 1407 */ 1408 static void 1409 doemit(struct parse *p, sop op, size_t opnd) 1410 { 1411 /* avoid making error situations worse */ 1412 if (p->error != 0) 1413 return; 1414 1415 /* deal with oversize operands ("can't happen", more or less) */ 1416 assert(opnd < 1<<OPSHIFT); 1417 1418 /* deal with undersized strip */ 1419 if (p->slen >= p->ssize) 1420 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1421 return; 1422 1423 /* finally, it's all reduced to the easy case */ 1424 p->strip[p->slen++] = SOP(op, opnd); 1425 } 1426 1427 /* 1428 * doinsert - insert a sop into the strip 1429 */ 1430 static void 1431 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1432 { 1433 sopno sn; 1434 sop s; 1435 int i; 1436 1437 /* avoid making error situations worse */ 1438 if (p->error != 0) 1439 return; 1440 1441 sn = HERE(); 1442 EMIT(op, opnd); /* do checks, ensure space */ 1443 assert(HERE() == sn+1); 1444 s = p->strip[sn]; 1445 1446 /* adjust paren pointers */ 1447 assert(pos > 0); 1448 for (i = 1; i < NPAREN; i++) { 1449 if (p->pbegin[i] >= pos) { 1450 p->pbegin[i]++; 1451 } 1452 if (p->pend[i] >= pos) { 1453 p->pend[i]++; 1454 } 1455 } 1456 1457 (void) memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1458 (HERE()-pos-1)*sizeof (sop)); 1459 p->strip[pos] = s; 1460 } 1461 1462 /* 1463 * dofwd - complete a forward reference 1464 */ 1465 static void 1466 dofwd(struct parse *p, sopno pos, sop value) 1467 { 1468 /* avoid making error situations worse */ 1469 if (p->error != 0) 1470 return; 1471 1472 assert(value < 1<<OPSHIFT); 1473 p->strip[pos] = OP(p->strip[pos]) | value; 1474 } 1475 1476 /* 1477 * enlarge - enlarge the strip 1478 */ 1479 static int 1480 enlarge(struct parse *p, sopno size) 1481 { 1482 sop *sp; 1483 1484 if (p->ssize >= size) 1485 return (1); 1486 1487 sp = (sop *)realloc(p->strip, size*sizeof (sop)); 1488 if (sp == NULL) { 1489 SETERROR(REG_ESPACE); 1490 return (0); 1491 } 1492 p->strip = sp; 1493 p->ssize = size; 1494 return (1); 1495 } 1496 1497 /* 1498 * stripsnug - compact the strip 1499 */ 1500 static void 1501 stripsnug(struct parse *p, struct re_guts *g) 1502 { 1503 g->nstates = p->slen; 1504 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof (sop)); 1505 if (g->strip == NULL) { 1506 SETERROR(REG_ESPACE); 1507 g->strip = p->strip; 1508 } 1509 } 1510 1511 /* 1512 * findmust - fill in must and mlen with longest mandatory literal string 1513 * 1514 * This algorithm could do fancy things like analyzing the operands of | 1515 * for common subsequences. Someday. This code is simple and finds most 1516 * of the interesting cases. 1517 * 1518 * Note that must and mlen got initialized during setup. 1519 */ 1520 static void 1521 findmust(struct parse *p, struct re_guts *g) 1522 { 1523 sop *scan; 1524 sop *start = NULL; 1525 sop *newstart = NULL; 1526 sopno newlen; 1527 sop s; 1528 char *cp; 1529 int offset; 1530 char buf[MB_LEN_MAX]; 1531 size_t clen; 1532 mbstate_t mbs; 1533 locale_t loc = uselocale(NULL); 1534 1535 /* avoid making error situations worse */ 1536 if (p->error != 0) 1537 return; 1538 1539 /* 1540 * It's not generally safe to do a ``char'' substring search on 1541 * multibyte character strings, but it's safe for at least 1542 * UTF-8 (see RFC 3629). 1543 */ 1544 if (MB_CUR_MAX > 1 && 1545 strcmp(loc->runelocale->__encoding, "UTF-8") != 0) 1546 return; 1547 1548 /* find the longest OCHAR sequence in strip */ 1549 newlen = 0; 1550 offset = 0; 1551 g->moffset = 0; 1552 scan = g->strip + 1; 1553 do { 1554 s = *scan++; 1555 switch (OP(s)) { 1556 case OCHAR: /* sequence member */ 1557 if (newlen == 0) { /* new sequence */ 1558 (void) memset(&mbs, 0, sizeof (mbs)); 1559 newstart = scan - 1; 1560 } 1561 clen = wcrtomb(buf, OPND(s), &mbs); 1562 if (clen == (size_t)-1) 1563 goto toohard; 1564 newlen += clen; 1565 break; 1566 case OPLUS_: /* things that don't break one */ 1567 case OLPAREN: 1568 case ORPAREN: 1569 break; 1570 case OQUEST_: /* things that must be skipped */ 1571 case OCH_: 1572 offset = altoffset(scan, offset); 1573 scan--; 1574 do { 1575 scan += OPND(s); 1576 s = *scan; 1577 /* assert() interferes w debug printouts */ 1578 if (OP(s) != (sop)O_QUEST && 1579 OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) { 1580 g->iflags |= BAD; 1581 return; 1582 } 1583 } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH); 1584 /* FALLTHROUGH */ 1585 case OBOW: /* things that break a sequence */ 1586 case OEOW: 1587 case OBOL: 1588 case OEOL: 1589 case O_QUEST: 1590 case O_CH: 1591 case OEND: 1592 if (newlen > (sopno)g->mlen) { /* ends one */ 1593 start = newstart; 1594 g->mlen = newlen; 1595 if (offset > -1) { 1596 g->moffset += offset; 1597 offset = newlen; 1598 } else 1599 g->moffset = offset; 1600 } else { 1601 if (offset > -1) 1602 offset += newlen; 1603 } 1604 newlen = 0; 1605 break; 1606 case OANY: 1607 if (newlen > (sopno)g->mlen) { /* ends one */ 1608 start = newstart; 1609 g->mlen = newlen; 1610 if (offset > -1) { 1611 g->moffset += offset; 1612 offset = newlen; 1613 } else 1614 g->moffset = offset; 1615 } else { 1616 if (offset > -1) 1617 offset += newlen; 1618 } 1619 if (offset > -1) 1620 offset++; 1621 newlen = 0; 1622 break; 1623 case OANYOF: /* may or may not invalidate offset */ 1624 /* First, everything as OANY */ 1625 if (newlen > (sopno)g->mlen) { /* ends one */ 1626 start = newstart; 1627 g->mlen = newlen; 1628 if (offset > -1) { 1629 g->moffset += offset; 1630 offset = newlen; 1631 } else 1632 g->moffset = offset; 1633 } else { 1634 if (offset > -1) 1635 offset += newlen; 1636 } 1637 if (offset > -1) 1638 offset++; 1639 newlen = 0; 1640 break; 1641 toohard: 1642 default: 1643 /* 1644 * Anything here makes it impossible or too hard 1645 * to calculate the offset -- so we give up; 1646 * save the last known good offset, in case the 1647 * must sequence doesn't occur later. 1648 */ 1649 if (newlen > (sopno)g->mlen) { /* ends one */ 1650 start = newstart; 1651 g->mlen = newlen; 1652 if (offset > -1) 1653 g->moffset += offset; 1654 else 1655 g->moffset = offset; 1656 } 1657 offset = -1; 1658 newlen = 0; 1659 break; 1660 } 1661 } while (OP(s) != OEND); 1662 1663 if (g->mlen == 0) { /* there isn't one */ 1664 g->moffset = -1; 1665 return; 1666 } 1667 1668 /* turn it into a character string */ 1669 g->must = malloc((size_t)g->mlen + 1); 1670 if (g->must == NULL) { /* argh; just forget it */ 1671 g->mlen = 0; 1672 g->moffset = -1; 1673 return; 1674 } 1675 cp = g->must; 1676 scan = start; 1677 (void) memset(&mbs, 0, sizeof (mbs)); 1678 while (cp < g->must + g->mlen) { 1679 while (OP(s = *scan++) != OCHAR) 1680 continue; 1681 clen = wcrtomb(cp, OPND(s), &mbs); 1682 assert(clen != (size_t)-1); 1683 cp += clen; 1684 } 1685 assert(cp == g->must + g->mlen); 1686 *cp++ = '\0'; /* just on general principles */ 1687 } 1688 1689 /* 1690 * altoffset - choose biggest offset among multiple choices 1691 * 1692 * Compute, recursively if necessary, the largest offset among multiple 1693 * re paths. 1694 */ 1695 static int 1696 altoffset(sop *scan, int offset) 1697 { 1698 int largest; 1699 int try; 1700 sop s; 1701 1702 /* If we gave up already on offsets, return */ 1703 if (offset == -1) 1704 return (-1); 1705 1706 largest = 0; 1707 try = 0; 1708 s = *scan++; 1709 while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) { 1710 switch (OP(s)) { 1711 case OOR1: 1712 if (try > largest) 1713 largest = try; 1714 try = 0; 1715 break; 1716 case OQUEST_: 1717 case OCH_: 1718 try = altoffset(scan, try); 1719 if (try == -1) 1720 return (-1); 1721 scan--; 1722 do { 1723 scan += OPND(s); 1724 s = *scan; 1725 if (OP(s) != (sop)O_QUEST && 1726 OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) 1727 return (-1); 1728 } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH); 1729 /* 1730 * We must skip to the next position, or we'll 1731 * leave altoffset() too early. 1732 */ 1733 scan++; 1734 break; 1735 case OANYOF: 1736 case OCHAR: 1737 case OANY: 1738 try++; 1739 /*FALLTHRU*/ 1740 case OBOW: 1741 case OEOW: 1742 case OLPAREN: 1743 case ORPAREN: 1744 case OOR2: 1745 break; 1746 default: 1747 try = -1; 1748 break; 1749 } 1750 if (try == -1) 1751 return (-1); 1752 s = *scan++; 1753 } 1754 1755 if (try > largest) 1756 largest = try; 1757 1758 return (largest+offset); 1759 } 1760 1761 /* 1762 * computejumps - compute char jumps for BM scan 1763 * 1764 * This algorithm assumes g->must exists and is has size greater than 1765 * zero. It's based on the algorithm found on Computer Algorithms by 1766 * Sara Baase. 1767 * 1768 * A char jump is the number of characters one needs to jump based on 1769 * the value of the character from the text that was mismatched. 1770 */ 1771 static void 1772 computejumps(struct parse *p, struct re_guts *g) 1773 { 1774 int ch; 1775 int mindex; 1776 1777 /* Avoid making errors worse */ 1778 if (p->error != 0) 1779 return; 1780 1781 g->charjump = (int *)malloc((NC_MAX + 1) * sizeof (int)); 1782 if (g->charjump == NULL) /* Not a fatal error */ 1783 return; 1784 /* Adjust for signed chars, if necessary */ 1785 g->charjump = &g->charjump[-(CHAR_MIN)]; 1786 1787 /* 1788 * If the character does not exist in the pattern, the jump 1789 * is equal to the number of characters in the pattern. 1790 */ 1791 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 1792 g->charjump[ch] = g->mlen; 1793 1794 /* 1795 * If the character does exist, compute the jump that would 1796 * take us to the last character in the pattern equal to it 1797 * (notice that we match right to left, so that last character 1798 * is the first one that would be matched). 1799 */ 1800 for (mindex = 0; mindex < g->mlen; mindex++) 1801 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 1802 } 1803 1804 /* 1805 * computematchjumps - compute match jumps for BM scan 1806 * 1807 * This algorithm assumes g->must exists and is has size greater than 1808 * zero. It's based on the algorithm found on Computer Algorithms by 1809 * Sara Baase. 1810 * 1811 * A match jump is the number of characters one needs to advance based 1812 * on the already-matched suffix. 1813 * Notice that all values here are minus (g->mlen-1), because of the way 1814 * the search algorithm works. 1815 */ 1816 static void 1817 computematchjumps(struct parse *p, struct re_guts *g) 1818 { 1819 int mindex; /* General "must" iterator */ 1820 int suffix; /* Keeps track of matching suffix */ 1821 int ssuffix; /* Keeps track of suffixes' suffix */ 1822 int *pmatches; 1823 /* 1824 * pmatches[k] points to the next i 1825 * such that i+1...mlen is a substring 1826 * of k+1...k+mlen-i-1 1827 */ 1828 1829 /* Avoid making errors worse */ 1830 if (p->error != 0) 1831 return; 1832 1833 pmatches = (int *)malloc(g->mlen * sizeof (unsigned int)); 1834 if (pmatches == NULL) { 1835 g->matchjump = NULL; 1836 return; 1837 } 1838 1839 g->matchjump = (int *)malloc(g->mlen * sizeof (unsigned int)); 1840 if (g->matchjump == NULL) { /* Not a fatal error */ 1841 free(pmatches); 1842 return; 1843 } 1844 1845 /* Set maximum possible jump for each character in the pattern */ 1846 for (mindex = 0; mindex < g->mlen; mindex++) 1847 g->matchjump[mindex] = 2*g->mlen - mindex - 1; 1848 1849 /* Compute pmatches[] */ 1850 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 1851 mindex--, suffix--) { 1852 pmatches[mindex] = suffix; 1853 1854 /* 1855 * If a mismatch is found, interrupting the substring, 1856 * compute the matchjump for that position. If no 1857 * mismatch is found, then a text substring mismatched 1858 * against the suffix will also mismatch against the 1859 * substring. 1860 */ 1861 while (suffix < g->mlen && g->must[mindex] != g->must[suffix]) { 1862 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1863 g->mlen - mindex - 1); 1864 suffix = pmatches[suffix]; 1865 } 1866 } 1867 1868 /* 1869 * Compute the matchjump up to the last substring found to jump 1870 * to the beginning of the largest must pattern prefix matching 1871 * it's own suffix. 1872 */ 1873 for (mindex = 0; mindex <= suffix; mindex++) 1874 g->matchjump[mindex] = MIN(g->matchjump[mindex], 1875 g->mlen + suffix - mindex); 1876 1877 ssuffix = pmatches[suffix]; 1878 while (suffix < g->mlen) { 1879 while (suffix <= ssuffix && suffix < g->mlen) { 1880 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1881 g->mlen + ssuffix - suffix); 1882 suffix++; 1883 } 1884 if (suffix < g->mlen) 1885 ssuffix = pmatches[ssuffix]; 1886 } 1887 1888 free(pmatches); 1889 } 1890 1891 /* 1892 * pluscount - count + nesting 1893 */ 1894 static sopno /* nesting depth */ 1895 pluscount(struct parse *p, struct re_guts *g) 1896 { 1897 sop *scan; 1898 sop s; 1899 sopno plusnest = 0; 1900 sopno maxnest = 0; 1901 1902 if (p->error != 0) 1903 return (0); /* there may not be an OEND */ 1904 1905 scan = g->strip + 1; 1906 do { 1907 s = *scan++; 1908 switch (OP(s)) { 1909 case OPLUS_: 1910 plusnest++; 1911 break; 1912 case O_PLUS: 1913 if (plusnest > maxnest) 1914 maxnest = plusnest; 1915 plusnest--; 1916 break; 1917 } 1918 } while (OP(s) != OEND); 1919 if (plusnest != 0) 1920 g->iflags |= BAD; 1921 return (maxnest); 1922 } 1923