1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Copyright (c) 2011 The FreeBSD Foundation 9 * All rights reserved. 10 * Portions of this software were developed by David Chisnall 11 * under sponsorship from the FreeBSD Foundation. 12 * 13 * This code is derived from software contributed to Berkeley by 14 * Henry Spencer. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 41 */ 42 43 #if defined(LIBC_SCCS) && !defined(lint) 44 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 45 #endif /* LIBC_SCCS and not lint */ 46 #include <sys/cdefs.h> 47 __FBSDID("$FreeBSD$"); 48 49 #include <sys/types.h> 50 #include <stdio.h> 51 #include <string.h> 52 #include <ctype.h> 53 #include <limits.h> 54 #include <stdlib.h> 55 #include <regex.h> 56 #include <stdbool.h> 57 #include <wchar.h> 58 #include <wctype.h> 59 60 #ifndef LIBREGEX 61 #include "collate.h" 62 #endif 63 64 #include "utils.h" 65 #include "regex2.h" 66 67 #include "cname.h" 68 69 /* 70 * Branching context, used to keep track of branch state for all of the branch- 71 * aware functions. In addition to keeping track of branch positions for the 72 * p_branch_* functions, we use this to simplify some clumsiness in BREs for 73 * detection of whether ^ is acting as an anchor or being used erroneously and 74 * also for whether we're in a sub-expression or not. 75 */ 76 struct branchc { 77 sopno start; 78 sopno back; 79 sopno fwd; 80 81 int nbranch; 82 int nchain; 83 bool outer; 84 bool terminate; 85 }; 86 87 /* 88 * parse structure, passed up and down to avoid global variables and 89 * other clumsinesses 90 */ 91 struct parse { 92 const char *next; /* next character in RE */ 93 const char *end; /* end of string (-> NUL normally) */ 94 int error; /* has an error been seen? */ 95 sop *strip; /* malloced strip */ 96 sopno ssize; /* malloced strip size (allocated) */ 97 sopno slen; /* malloced strip length (used) */ 98 int ncsalloc; /* number of csets allocated */ 99 struct re_guts *g; 100 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 101 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 102 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 103 bool allowbranch; /* can this expression branch? */ 104 bool bre; /* convenience; is this a BRE? */ 105 bool (*parse_expr)(struct parse *, struct branchc *); 106 void (*pre_parse)(struct parse *, struct branchc *); 107 void (*post_parse)(struct parse *, struct branchc *); 108 }; 109 110 /* ========= begin header generated by ./mkh ========= */ 111 #ifdef __cplusplus 112 extern "C" { 113 #endif 114 115 /* === regcomp.c === */ 116 static bool p_ere_exp(struct parse *p, struct branchc *bc); 117 static void p_str(struct parse *p); 118 static int p_branch_eat_delim(struct parse *p, struct branchc *bc); 119 static void p_branch_ins_offset(struct parse *p, struct branchc *bc); 120 static void p_branch_fix_tail(struct parse *p, struct branchc *bc); 121 static bool p_branch_empty(struct parse *p, struct branchc *bc); 122 static bool p_branch_do(struct parse *p, struct branchc *bc); 123 static void p_bre_pre_parse(struct parse *p, struct branchc *bc); 124 static void p_bre_post_parse(struct parse *p, struct branchc *bc); 125 static void p_re(struct parse *p, int end1, int end2); 126 static bool p_simp_re(struct parse *p, struct branchc *bc); 127 static int p_count(struct parse *p); 128 static void p_bracket(struct parse *p); 129 static int p_range_cmp(wchar_t c1, wchar_t c2); 130 static void p_b_term(struct parse *p, cset *cs); 131 static void p_b_cclass(struct parse *p, cset *cs); 132 static void p_b_eclass(struct parse *p, cset *cs); 133 static wint_t p_b_symbol(struct parse *p); 134 static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 135 static wint_t othercase(wint_t ch); 136 static void bothcases(struct parse *p, wint_t ch); 137 static void ordinary(struct parse *p, wint_t ch); 138 static void nonnewline(struct parse *p); 139 static void repeat(struct parse *p, sopno start, int from, int to); 140 static int seterr(struct parse *p, int e); 141 static cset *allocset(struct parse *p); 142 static void freeset(struct parse *p, cset *cs); 143 static void CHadd(struct parse *p, cset *cs, wint_t ch); 144 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 145 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 146 static wint_t singleton(cset *cs); 147 static sopno dupl(struct parse *p, sopno start, sopno finish); 148 static void doemit(struct parse *p, sop op, size_t opnd); 149 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 150 static void dofwd(struct parse *p, sopno pos, sop value); 151 static int enlarge(struct parse *p, sopno size); 152 static void stripsnug(struct parse *p, struct re_guts *g); 153 static void findmust(struct parse *p, struct re_guts *g); 154 static int altoffset(sop *scan, int offset); 155 static void computejumps(struct parse *p, struct re_guts *g); 156 static void computematchjumps(struct parse *p, struct re_guts *g); 157 static sopno pluscount(struct parse *p, struct re_guts *g); 158 static wint_t wgetnext(struct parse *p); 159 160 #ifdef __cplusplus 161 } 162 #endif 163 /* ========= end header generated by ./mkh ========= */ 164 165 static char nuls[10]; /* place to point scanner in event of error */ 166 167 /* 168 * macros for use with parse structure 169 * BEWARE: these know that the parse structure is named `p' !!! 170 */ 171 #define PEEK() (*p->next) 172 #define PEEK2() (*(p->next+1)) 173 #define MORE() (p->next < p->end) 174 #define MORE2() (p->next+1 < p->end) 175 #define SEE(c) (MORE() && PEEK() == (c)) 176 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 177 #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a)) 178 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 179 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 180 #define NEXT() (p->next++) 181 #define NEXT2() (p->next += 2) 182 #define NEXTn(n) (p->next += (n)) 183 #define GETNEXT() (*p->next++) 184 #define WGETNEXT() wgetnext(p) 185 #define SETERROR(e) seterr(p, (e)) 186 #define REQUIRE(co, e) ((co) || SETERROR(e)) 187 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 188 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 189 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 190 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 191 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 192 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 193 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 194 #define HERE() (p->slen) 195 #define THERE() (p->slen - 1) 196 #define THERETHERE() (p->slen - 2) 197 #define DROP(n) (p->slen -= (n)) 198 199 /* Macro used by computejump()/computematchjump() */ 200 #define MIN(a,b) ((a)<(b)?(a):(b)) 201 202 /* 203 - regcomp - interface for parser and compilation 204 = extern int regcomp(regex_t *, const char *, int); 205 = #define REG_BASIC 0000 206 = #define REG_EXTENDED 0001 207 = #define REG_ICASE 0002 208 = #define REG_NOSUB 0004 209 = #define REG_NEWLINE 0010 210 = #define REG_NOSPEC 0020 211 = #define REG_PEND 0040 212 = #define REG_DUMP 0200 213 */ 214 int /* 0 success, otherwise REG_something */ 215 regcomp(regex_t * __restrict preg, 216 const char * __restrict pattern, 217 int cflags) 218 { 219 struct parse pa; 220 struct re_guts *g; 221 struct parse *p = &pa; 222 int i; 223 size_t len; 224 size_t maxlen; 225 #ifdef REDEBUG 226 # define GOODFLAGS(f) (f) 227 #else 228 # define GOODFLAGS(f) ((f)&~REG_DUMP) 229 #endif 230 231 cflags = GOODFLAGS(cflags); 232 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 233 return(REG_INVARG); 234 235 if (cflags®_PEND) { 236 if (preg->re_endp < pattern) 237 return(REG_INVARG); 238 len = preg->re_endp - pattern; 239 } else 240 len = strlen(pattern); 241 242 /* do the mallocs early so failure handling is easy */ 243 g = (struct re_guts *)malloc(sizeof(struct re_guts)); 244 if (g == NULL) 245 return(REG_ESPACE); 246 /* 247 * Limit the pattern space to avoid a 32-bit overflow on buffer 248 * extension. Also avoid any signed overflow in case of conversion 249 * so make the real limit based on a 31-bit overflow. 250 * 251 * Likely not applicable on 64-bit systems but handle the case 252 * generically (who are we to stop people from using ~715MB+ 253 * patterns?). 254 */ 255 maxlen = ((size_t)-1 >> 1) / sizeof(sop) * 2 / 3; 256 if (len >= maxlen) { 257 free((char *)g); 258 return(REG_ESPACE); 259 } 260 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 261 assert(p->ssize >= len); 262 263 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 264 p->slen = 0; 265 if (p->strip == NULL) { 266 free((char *)g); 267 return(REG_ESPACE); 268 } 269 270 /* set things up */ 271 p->g = g; 272 p->next = pattern; /* convenience; we do not modify it */ 273 p->end = p->next + len; 274 p->error = 0; 275 p->ncsalloc = 0; 276 for (i = 0; i < NPAREN; i++) { 277 p->pbegin[i] = 0; 278 p->pend[i] = 0; 279 } 280 if (cflags & REG_EXTENDED) { 281 p->allowbranch = true; 282 p->bre = false; 283 p->parse_expr = p_ere_exp; 284 p->pre_parse = NULL; 285 p->post_parse = NULL; 286 } else { 287 p->allowbranch = false; 288 p->bre = true; 289 p->parse_expr = p_simp_re; 290 p->pre_parse = p_bre_pre_parse; 291 p->post_parse = p_bre_post_parse; 292 } 293 g->sets = NULL; 294 g->ncsets = 0; 295 g->cflags = cflags; 296 g->iflags = 0; 297 g->nbol = 0; 298 g->neol = 0; 299 g->must = NULL; 300 g->moffset = -1; 301 g->charjump = NULL; 302 g->matchjump = NULL; 303 g->mlen = 0; 304 g->nsub = 0; 305 g->backrefs = 0; 306 307 /* do it */ 308 EMIT(OEND, 0); 309 g->firststate = THERE(); 310 if (cflags & REG_NOSPEC) 311 p_str(p); 312 else 313 p_re(p, OUT, OUT); 314 EMIT(OEND, 0); 315 g->laststate = THERE(); 316 317 /* tidy up loose ends and fill things in */ 318 stripsnug(p, g); 319 findmust(p, g); 320 /* only use Boyer-Moore algorithm if the pattern is bigger 321 * than three characters 322 */ 323 if(g->mlen > 3) { 324 computejumps(p, g); 325 computematchjumps(p, g); 326 if(g->matchjump == NULL && g->charjump != NULL) { 327 free(g->charjump); 328 g->charjump = NULL; 329 } 330 } 331 g->nplus = pluscount(p, g); 332 g->magic = MAGIC2; 333 preg->re_nsub = g->nsub; 334 preg->re_g = g; 335 preg->re_magic = MAGIC1; 336 #ifndef REDEBUG 337 /* not debugging, so can't rely on the assert() in regexec() */ 338 if (g->iflags&BAD) 339 SETERROR(REG_ASSERT); 340 #endif 341 342 /* win or lose, we're done */ 343 if (p->error != 0) /* lose */ 344 regfree(preg); 345 return(p->error); 346 } 347 348 /* 349 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op, 350 - return whether we should terminate or not 351 == static bool p_ere_exp(struct parse *p); 352 */ 353 static bool 354 p_ere_exp(struct parse *p, struct branchc *bc) 355 { 356 char c; 357 wint_t wc; 358 sopno pos; 359 int count; 360 int count2; 361 sopno subno; 362 int wascaret = 0; 363 364 (void)bc; 365 assert(MORE()); /* caller should have ensured this */ 366 c = GETNEXT(); 367 368 pos = HERE(); 369 switch (c) { 370 case '(': 371 (void)REQUIRE(MORE(), REG_EPAREN); 372 p->g->nsub++; 373 subno = p->g->nsub; 374 if (subno < NPAREN) 375 p->pbegin[subno] = HERE(); 376 EMIT(OLPAREN, subno); 377 if (!SEE(')')) 378 p_re(p, ')', IGN); 379 if (subno < NPAREN) { 380 p->pend[subno] = HERE(); 381 assert(p->pend[subno] != 0); 382 } 383 EMIT(ORPAREN, subno); 384 (void)MUSTEAT(')', REG_EPAREN); 385 break; 386 #ifndef POSIX_MISTAKE 387 case ')': /* happens only if no current unmatched ( */ 388 /* 389 * You may ask, why the ifndef? Because I didn't notice 390 * this until slightly too late for 1003.2, and none of the 391 * other 1003.2 regular-expression reviewers noticed it at 392 * all. So an unmatched ) is legal POSIX, at least until 393 * we can get it fixed. 394 */ 395 SETERROR(REG_EPAREN); 396 break; 397 #endif 398 case '^': 399 EMIT(OBOL, 0); 400 p->g->iflags |= USEBOL; 401 p->g->nbol++; 402 wascaret = 1; 403 break; 404 case '$': 405 EMIT(OEOL, 0); 406 p->g->iflags |= USEEOL; 407 p->g->neol++; 408 break; 409 case '|': 410 SETERROR(REG_EMPTY); 411 break; 412 case '*': 413 case '+': 414 case '?': 415 case '{': 416 SETERROR(REG_BADRPT); 417 break; 418 case '.': 419 if (p->g->cflags®_NEWLINE) 420 nonnewline(p); 421 else 422 EMIT(OANY, 0); 423 break; 424 case '[': 425 p_bracket(p); 426 break; 427 case '\\': 428 (void)REQUIRE(MORE(), REG_EESCAPE); 429 wc = WGETNEXT(); 430 switch (wc) { 431 case '<': 432 EMIT(OBOW, 0); 433 break; 434 case '>': 435 EMIT(OEOW, 0); 436 break; 437 default: 438 ordinary(p, wc); 439 break; 440 } 441 break; 442 default: 443 if (p->error != 0) 444 return (false); 445 p->next--; 446 wc = WGETNEXT(); 447 ordinary(p, wc); 448 break; 449 } 450 451 if (!MORE()) 452 return (false); 453 c = PEEK(); 454 /* we call { a repetition if followed by a digit */ 455 if (!( c == '*' || c == '+' || c == '?' || c == '{')) 456 return (false); /* no repetition, we're done */ 457 else if (c == '{') 458 (void)REQUIRE(MORE2() && \ 459 (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT); 460 NEXT(); 461 462 (void)REQUIRE(!wascaret, REG_BADRPT); 463 switch (c) { 464 case '*': /* implemented as +? */ 465 /* this case does not require the (y|) trick, noKLUDGE */ 466 INSERT(OPLUS_, pos); 467 ASTERN(O_PLUS, pos); 468 INSERT(OQUEST_, pos); 469 ASTERN(O_QUEST, pos); 470 break; 471 case '+': 472 INSERT(OPLUS_, pos); 473 ASTERN(O_PLUS, pos); 474 break; 475 case '?': 476 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 477 INSERT(OCH_, pos); /* offset slightly wrong */ 478 ASTERN(OOR1, pos); /* this one's right */ 479 AHEAD(pos); /* fix the OCH_ */ 480 EMIT(OOR2, 0); /* offset very wrong... */ 481 AHEAD(THERE()); /* ...so fix it */ 482 ASTERN(O_CH, THERETHERE()); 483 break; 484 case '{': 485 count = p_count(p); 486 if (EAT(',')) { 487 if (isdigit((uch)PEEK())) { 488 count2 = p_count(p); 489 (void)REQUIRE(count <= count2, REG_BADBR); 490 } else /* single number with comma */ 491 count2 = INFINITY; 492 } else /* just a single number */ 493 count2 = count; 494 repeat(p, pos, count, count2); 495 if (!EAT('}')) { /* error heuristics */ 496 while (MORE() && PEEK() != '}') 497 NEXT(); 498 (void)REQUIRE(MORE(), REG_EBRACE); 499 SETERROR(REG_BADBR); 500 } 501 break; 502 } 503 504 if (!MORE()) 505 return (false); 506 c = PEEK(); 507 if (!( c == '*' || c == '+' || c == '?' || 508 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 509 return (false); 510 SETERROR(REG_BADRPT); 511 return (false); 512 } 513 514 /* 515 - p_str - string (no metacharacters) "parser" 516 == static void p_str(struct parse *p); 517 */ 518 static void 519 p_str(struct parse *p) 520 { 521 (void)REQUIRE(MORE(), REG_EMPTY); 522 while (MORE()) 523 ordinary(p, WGETNEXT()); 524 } 525 526 /* 527 * Eat consecutive branch delimiters for the kind of expression that we are 528 * parsing, return the number of delimiters that we ate. 529 */ 530 static int 531 p_branch_eat_delim(struct parse *p, struct branchc *bc) 532 { 533 int nskip; 534 535 (void)bc; 536 nskip = 0; 537 while (EAT('|')) 538 ++nskip; 539 return (nskip); 540 } 541 542 /* 543 * Insert necessary branch book-keeping operations. This emits a 544 * bogus 'next' offset, since we still have more to parse 545 */ 546 static void 547 p_branch_ins_offset(struct parse *p, struct branchc *bc) 548 { 549 550 if (bc->nbranch == 0) { 551 INSERT(OCH_, bc->start); /* offset is wrong */ 552 bc->fwd = bc->start; 553 bc->back = bc->start; 554 } 555 556 ASTERN(OOR1, bc->back); 557 bc->back = THERE(); 558 AHEAD(bc->fwd); /* fix previous offset */ 559 bc->fwd = HERE(); 560 EMIT(OOR2, 0); /* offset is very wrong */ 561 ++bc->nbranch; 562 } 563 564 /* 565 * Fix the offset of the tail branch, if we actually had any branches. 566 * This is to correct the bogus placeholder offset that we use. 567 */ 568 static void 569 p_branch_fix_tail(struct parse *p, struct branchc *bc) 570 { 571 572 /* Fix bogus offset at the tail if we actually have branches */ 573 if (bc->nbranch > 0) { 574 AHEAD(bc->fwd); 575 ASTERN(O_CH, bc->back); 576 } 577 } 578 579 /* 580 * Signal to the parser that an empty branch has been encountered; this will, 581 * in the future, be used to allow for more permissive behavior with empty 582 * branches. The return value should indicate whether parsing may continue 583 * or not. 584 */ 585 static bool 586 p_branch_empty(struct parse *p, struct branchc *bc) 587 { 588 589 (void)bc; 590 SETERROR(REG_EMPTY); 591 return (false); 592 } 593 594 /* 595 * Take care of any branching requirements. This includes inserting the 596 * appropriate branching instructions as well as eating all of the branch 597 * delimiters until we either run out of pattern or need to parse more pattern. 598 */ 599 static bool 600 p_branch_do(struct parse *p, struct branchc *bc) 601 { 602 int ate = 0; 603 604 ate = p_branch_eat_delim(p, bc); 605 if (ate == 0) 606 return (false); 607 else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc)) 608 /* 609 * Halt parsing only if we have an empty branch and p_branch_empty 610 * indicates that we must not continue. In the future, this will not 611 * necessarily be an error. 612 */ 613 return (false); 614 p_branch_ins_offset(p, bc); 615 616 return (true); 617 } 618 619 static void 620 p_bre_pre_parse(struct parse *p, struct branchc *bc) 621 { 622 623 (void) bc; 624 /* 625 * Does not move cleanly into expression parser because of 626 * ordinary interpration of * at the beginning position of 627 * an expression. 628 */ 629 if (EAT('^')) { 630 EMIT(OBOL, 0); 631 p->g->iflags |= USEBOL; 632 p->g->nbol++; 633 } 634 } 635 636 static void 637 p_bre_post_parse(struct parse *p, struct branchc *bc) 638 { 639 640 /* Expression is terminating due to EOL token */ 641 if (bc->terminate) { 642 DROP(1); 643 EMIT(OEOL, 0); 644 p->g->iflags |= USEEOL; 645 p->g->neol++; 646 } 647 } 648 649 /* 650 - p_re - Top level parser, concatenation and BRE anchoring 651 == static void p_re(struct parse *p, int end1, int end2); 652 * Giving end1 as OUT essentially eliminates the end1/end2 check. 653 * 654 * This implementation is a bit of a kludge, in that a trailing $ is first 655 * taken as an ordinary character and then revised to be an anchor. 656 * The amount of lookahead needed to avoid this kludge is excessive. 657 */ 658 static void 659 p_re(struct parse *p, 660 int end1, /* first terminating character */ 661 int end2) /* second terminating character; ignored for EREs */ 662 { 663 struct branchc bc; 664 665 bc.nbranch = 0; 666 if (end1 == OUT && end2 == OUT) 667 bc.outer = true; 668 else 669 bc.outer = false; 670 #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2)) 671 for (;;) { 672 bc.start = HERE(); 673 bc.nchain = 0; 674 bc.terminate = false; 675 if (p->pre_parse != NULL) 676 p->pre_parse(p, &bc); 677 while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) { 678 bc.terminate = p->parse_expr(p, &bc); 679 ++bc.nchain; 680 } 681 if (p->post_parse != NULL) 682 p->post_parse(p, &bc); 683 (void) REQUIRE(HERE() != bc.start, REG_EMPTY); 684 if (!p->allowbranch) 685 break; 686 /* 687 * p_branch_do's return value indicates whether we should 688 * continue parsing or not. This is both for correctness and 689 * a slight optimization, because it will check if we've 690 * encountered an empty branch or the end of the string 691 * immediately following a branch delimiter. 692 */ 693 if (!p_branch_do(p, &bc)) 694 break; 695 } 696 #undef SEE_END 697 if (p->allowbranch) 698 p_branch_fix_tail(p, &bc); 699 assert(!MORE() || SEE(end1)); 700 } 701 702 /* 703 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 704 == static bool p_simp_re(struct parse *p, struct branchc *bc); 705 */ 706 static bool /* was the simple RE an unbackslashed $? */ 707 p_simp_re(struct parse *p, struct branchc *bc) 708 { 709 int c; 710 int count; 711 int count2; 712 sopno pos; 713 int i; 714 wint_t wc; 715 sopno subno; 716 # define BACKSL (1<<CHAR_BIT) 717 718 pos = HERE(); /* repetition op, if any, covers from here */ 719 720 assert(MORE()); /* caller should have ensured this */ 721 c = GETNEXT(); 722 if (c == '\\') { 723 (void)REQUIRE(MORE(), REG_EESCAPE); 724 c = BACKSL | GETNEXT(); 725 } 726 switch (c) { 727 case '.': 728 if (p->g->cflags®_NEWLINE) 729 nonnewline(p); 730 else 731 EMIT(OANY, 0); 732 break; 733 case '[': 734 p_bracket(p); 735 break; 736 case BACKSL|'<': 737 EMIT(OBOW, 0); 738 break; 739 case BACKSL|'>': 740 EMIT(OEOW, 0); 741 break; 742 case BACKSL|'{': 743 SETERROR(REG_BADRPT); 744 break; 745 case BACKSL|'(': 746 p->g->nsub++; 747 subno = p->g->nsub; 748 if (subno < NPAREN) 749 p->pbegin[subno] = HERE(); 750 EMIT(OLPAREN, subno); 751 /* the MORE here is an error heuristic */ 752 if (MORE() && !SEETWO('\\', ')')) 753 p_re(p, '\\', ')'); 754 if (subno < NPAREN) { 755 p->pend[subno] = HERE(); 756 assert(p->pend[subno] != 0); 757 } 758 EMIT(ORPAREN, subno); 759 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 760 break; 761 case BACKSL|')': /* should not get here -- must be user */ 762 SETERROR(REG_EPAREN); 763 break; 764 case BACKSL|'1': 765 case BACKSL|'2': 766 case BACKSL|'3': 767 case BACKSL|'4': 768 case BACKSL|'5': 769 case BACKSL|'6': 770 case BACKSL|'7': 771 case BACKSL|'8': 772 case BACKSL|'9': 773 i = (c&~BACKSL) - '0'; 774 assert(i < NPAREN); 775 if (p->pend[i] != 0) { 776 assert(i <= p->g->nsub); 777 EMIT(OBACK_, i); 778 assert(p->pbegin[i] != 0); 779 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 780 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 781 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 782 EMIT(O_BACK, i); 783 } else 784 SETERROR(REG_ESUBREG); 785 p->g->backrefs = 1; 786 break; 787 case '*': 788 /* 789 * Ordinary if used as the first character beyond BOL anchor of 790 * a (sub-)expression, counts as a bad repetition operator if it 791 * appears otherwise. 792 */ 793 (void)REQUIRE(bc->nchain == 0, REG_BADRPT); 794 /* FALLTHROUGH */ 795 default: 796 if (p->error != 0) 797 return (false); /* Definitely not $... */ 798 p->next--; 799 wc = WGETNEXT(); 800 ordinary(p, wc); 801 break; 802 } 803 804 if (EAT('*')) { /* implemented as +? */ 805 /* this case does not require the (y|) trick, noKLUDGE */ 806 INSERT(OPLUS_, pos); 807 ASTERN(O_PLUS, pos); 808 INSERT(OQUEST_, pos); 809 ASTERN(O_QUEST, pos); 810 } else if (EATTWO('\\', '{')) { 811 count = p_count(p); 812 if (EAT(',')) { 813 if (MORE() && isdigit((uch)PEEK())) { 814 count2 = p_count(p); 815 (void)REQUIRE(count <= count2, REG_BADBR); 816 } else /* single number with comma */ 817 count2 = INFINITY; 818 } else /* just a single number */ 819 count2 = count; 820 repeat(p, pos, count, count2); 821 if (!EATTWO('\\', '}')) { /* error heuristics */ 822 while (MORE() && !SEETWO('\\', '}')) 823 NEXT(); 824 (void)REQUIRE(MORE(), REG_EBRACE); 825 SETERROR(REG_BADBR); 826 } 827 } else if (c == '$') /* $ (but not \$) ends it */ 828 return (true); 829 830 return (false); 831 } 832 833 /* 834 - p_count - parse a repetition count 835 == static int p_count(struct parse *p); 836 */ 837 static int /* the value */ 838 p_count(struct parse *p) 839 { 840 int count = 0; 841 int ndigits = 0; 842 843 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 844 count = count*10 + (GETNEXT() - '0'); 845 ndigits++; 846 } 847 848 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 849 return(count); 850 } 851 852 /* 853 - p_bracket - parse a bracketed character list 854 == static void p_bracket(struct parse *p); 855 */ 856 static void 857 p_bracket(struct parse *p) 858 { 859 cset *cs; 860 wint_t ch; 861 862 /* Dept of Truly Sickening Special-Case Kludges */ 863 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 864 EMIT(OBOW, 0); 865 NEXTn(6); 866 return; 867 } 868 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 869 EMIT(OEOW, 0); 870 NEXTn(6); 871 return; 872 } 873 874 if ((cs = allocset(p)) == NULL) 875 return; 876 877 if (p->g->cflags®_ICASE) 878 cs->icase = 1; 879 if (EAT('^')) 880 cs->invert = 1; 881 if (EAT(']')) 882 CHadd(p, cs, ']'); 883 else if (EAT('-')) 884 CHadd(p, cs, '-'); 885 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 886 p_b_term(p, cs); 887 if (EAT('-')) 888 CHadd(p, cs, '-'); 889 (void)MUSTEAT(']', REG_EBRACK); 890 891 if (p->error != 0) /* don't mess things up further */ 892 return; 893 894 if (cs->invert && p->g->cflags®_NEWLINE) 895 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 896 897 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 898 ordinary(p, ch); 899 freeset(p, cs); 900 } else 901 EMIT(OANYOF, (int)(cs - p->g->sets)); 902 } 903 904 static int 905 p_range_cmp(wchar_t c1, wchar_t c2) 906 { 907 #ifndef LIBREGEX 908 return __wcollate_range_cmp(c1, c2); 909 #else 910 /* Copied from libc/collate __wcollate_range_cmp */ 911 wchar_t s1[2], s2[2]; 912 913 s1[0] = c1; 914 s1[1] = L'\0'; 915 s2[0] = c2; 916 s2[1] = L'\0'; 917 return (wcscoll(s1, s2)); 918 #endif 919 } 920 921 /* 922 - p_b_term - parse one term of a bracketed character list 923 == static void p_b_term(struct parse *p, cset *cs); 924 */ 925 static void 926 p_b_term(struct parse *p, cset *cs) 927 { 928 char c; 929 wint_t start, finish; 930 wint_t i; 931 #ifndef LIBREGEX 932 struct xlocale_collate *table = 933 (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE]; 934 #endif 935 /* classify what we've got */ 936 switch ((MORE()) ? PEEK() : '\0') { 937 case '[': 938 c = (MORE2()) ? PEEK2() : '\0'; 939 break; 940 case '-': 941 SETERROR(REG_ERANGE); 942 return; /* NOTE RETURN */ 943 default: 944 c = '\0'; 945 break; 946 } 947 948 switch (c) { 949 case ':': /* character class */ 950 NEXT2(); 951 (void)REQUIRE(MORE(), REG_EBRACK); 952 c = PEEK(); 953 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 954 p_b_cclass(p, cs); 955 (void)REQUIRE(MORE(), REG_EBRACK); 956 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 957 break; 958 case '=': /* equivalence class */ 959 NEXT2(); 960 (void)REQUIRE(MORE(), REG_EBRACK); 961 c = PEEK(); 962 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 963 p_b_eclass(p, cs); 964 (void)REQUIRE(MORE(), REG_EBRACK); 965 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 966 break; 967 default: /* symbol, ordinary character, or range */ 968 start = p_b_symbol(p); 969 if (SEE('-') && MORE2() && PEEK2() != ']') { 970 /* range */ 971 NEXT(); 972 if (EAT('-')) 973 finish = '-'; 974 else 975 finish = p_b_symbol(p); 976 } else 977 finish = start; 978 if (start == finish) 979 CHadd(p, cs, start); 980 else { 981 #ifndef LIBREGEX 982 if (table->__collate_load_error || MB_CUR_MAX > 1) { 983 #else 984 if (MB_CUR_MAX > 1) { 985 #endif 986 (void)REQUIRE(start <= finish, REG_ERANGE); 987 CHaddrange(p, cs, start, finish); 988 } else { 989 (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE); 990 for (i = 0; i <= UCHAR_MAX; i++) { 991 if (p_range_cmp(start, i) <= 0 && 992 p_range_cmp(i, finish) <= 0 ) 993 CHadd(p, cs, i); 994 } 995 } 996 } 997 break; 998 } 999 } 1000 1001 /* 1002 - p_b_cclass - parse a character-class name and deal with it 1003 == static void p_b_cclass(struct parse *p, cset *cs); 1004 */ 1005 static void 1006 p_b_cclass(struct parse *p, cset *cs) 1007 { 1008 const char *sp = p->next; 1009 size_t len; 1010 wctype_t wct; 1011 char clname[16]; 1012 1013 while (MORE() && isalpha((uch)PEEK())) 1014 NEXT(); 1015 len = p->next - sp; 1016 if (len >= sizeof(clname) - 1) { 1017 SETERROR(REG_ECTYPE); 1018 return; 1019 } 1020 memcpy(clname, sp, len); 1021 clname[len] = '\0'; 1022 if ((wct = wctype(clname)) == 0) { 1023 SETERROR(REG_ECTYPE); 1024 return; 1025 } 1026 CHaddtype(p, cs, wct); 1027 } 1028 1029 /* 1030 - p_b_eclass - parse an equivalence-class name and deal with it 1031 == static void p_b_eclass(struct parse *p, cset *cs); 1032 * 1033 * This implementation is incomplete. xxx 1034 */ 1035 static void 1036 p_b_eclass(struct parse *p, cset *cs) 1037 { 1038 wint_t c; 1039 1040 c = p_b_coll_elem(p, '='); 1041 CHadd(p, cs, c); 1042 } 1043 1044 /* 1045 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 1046 == static wint_t p_b_symbol(struct parse *p); 1047 */ 1048 static wint_t /* value of symbol */ 1049 p_b_symbol(struct parse *p) 1050 { 1051 wint_t value; 1052 1053 (void)REQUIRE(MORE(), REG_EBRACK); 1054 if (!EATTWO('[', '.')) 1055 return(WGETNEXT()); 1056 1057 /* collating symbol */ 1058 value = p_b_coll_elem(p, '.'); 1059 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 1060 return(value); 1061 } 1062 1063 /* 1064 - p_b_coll_elem - parse a collating-element name and look it up 1065 == static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 1066 */ 1067 static wint_t /* value of collating element */ 1068 p_b_coll_elem(struct parse *p, 1069 wint_t endc) /* name ended by endc,']' */ 1070 { 1071 const char *sp = p->next; 1072 struct cname *cp; 1073 mbstate_t mbs; 1074 wchar_t wc; 1075 size_t clen, len; 1076 1077 while (MORE() && !SEETWO(endc, ']')) 1078 NEXT(); 1079 if (!MORE()) { 1080 SETERROR(REG_EBRACK); 1081 return(0); 1082 } 1083 len = p->next - sp; 1084 for (cp = cnames; cp->name != NULL; cp++) 1085 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) 1086 return(cp->code); /* known name */ 1087 memset(&mbs, 0, sizeof(mbs)); 1088 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 1089 return (wc); /* single character */ 1090 else if (clen == (size_t)-1 || clen == (size_t)-2) 1091 SETERROR(REG_ILLSEQ); 1092 else 1093 SETERROR(REG_ECOLLATE); /* neither */ 1094 return(0); 1095 } 1096 1097 /* 1098 - othercase - return the case counterpart of an alphabetic 1099 == static wint_t othercase(wint_t ch); 1100 */ 1101 static wint_t /* if no counterpart, return ch */ 1102 othercase(wint_t ch) 1103 { 1104 assert(iswalpha(ch)); 1105 if (iswupper(ch)) 1106 return(towlower(ch)); 1107 else if (iswlower(ch)) 1108 return(towupper(ch)); 1109 else /* peculiar, but could happen */ 1110 return(ch); 1111 } 1112 1113 /* 1114 - bothcases - emit a dualcase version of a two-case character 1115 == static void bothcases(struct parse *p, wint_t ch); 1116 * 1117 * Boy, is this implementation ever a kludge... 1118 */ 1119 static void 1120 bothcases(struct parse *p, wint_t ch) 1121 { 1122 const char *oldnext = p->next; 1123 const char *oldend = p->end; 1124 char bracket[3 + MB_LEN_MAX]; 1125 size_t n; 1126 mbstate_t mbs; 1127 1128 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1129 p->next = bracket; 1130 memset(&mbs, 0, sizeof(mbs)); 1131 n = wcrtomb(bracket, ch, &mbs); 1132 assert(n != (size_t)-1); 1133 bracket[n] = ']'; 1134 bracket[n + 1] = '\0'; 1135 p->end = bracket+n+1; 1136 p_bracket(p); 1137 assert(p->next == p->end); 1138 p->next = oldnext; 1139 p->end = oldend; 1140 } 1141 1142 /* 1143 - ordinary - emit an ordinary character 1144 == static void ordinary(struct parse *p, wint_t ch); 1145 */ 1146 static void 1147 ordinary(struct parse *p, wint_t ch) 1148 { 1149 cset *cs; 1150 1151 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 1152 bothcases(p, ch); 1153 else if ((ch & OPDMASK) == ch) 1154 EMIT(OCHAR, ch); 1155 else { 1156 /* 1157 * Kludge: character is too big to fit into an OCHAR operand. 1158 * Emit a singleton set. 1159 */ 1160 if ((cs = allocset(p)) == NULL) 1161 return; 1162 CHadd(p, cs, ch); 1163 EMIT(OANYOF, (int)(cs - p->g->sets)); 1164 } 1165 } 1166 1167 /* 1168 - nonnewline - emit REG_NEWLINE version of OANY 1169 == static void nonnewline(struct parse *p); 1170 * 1171 * Boy, is this implementation ever a kludge... 1172 */ 1173 static void 1174 nonnewline(struct parse *p) 1175 { 1176 const char *oldnext = p->next; 1177 const char *oldend = p->end; 1178 char bracket[4]; 1179 1180 p->next = bracket; 1181 p->end = bracket+3; 1182 bracket[0] = '^'; 1183 bracket[1] = '\n'; 1184 bracket[2] = ']'; 1185 bracket[3] = '\0'; 1186 p_bracket(p); 1187 assert(p->next == bracket+3); 1188 p->next = oldnext; 1189 p->end = oldend; 1190 } 1191 1192 /* 1193 - repeat - generate code for a bounded repetition, recursively if needed 1194 == static void repeat(struct parse *p, sopno start, int from, int to); 1195 */ 1196 static void 1197 repeat(struct parse *p, 1198 sopno start, /* operand from here to end of strip */ 1199 int from, /* repeated from this number */ 1200 int to) /* to this number of times (maybe INFINITY) */ 1201 { 1202 sopno finish = HERE(); 1203 # define N 2 1204 # define INF 3 1205 # define REP(f, t) ((f)*8 + (t)) 1206 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1207 sopno copy; 1208 1209 if (p->error != 0) /* head off possible runaway recursion */ 1210 return; 1211 1212 assert(from <= to); 1213 1214 switch (REP(MAP(from), MAP(to))) { 1215 case REP(0, 0): /* must be user doing this */ 1216 DROP(finish-start); /* drop the operand */ 1217 break; 1218 case REP(0, 1): /* as x{1,1}? */ 1219 case REP(0, N): /* as x{1,n}? */ 1220 case REP(0, INF): /* as x{1,}? */ 1221 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1222 INSERT(OCH_, start); /* offset is wrong... */ 1223 repeat(p, start+1, 1, to); 1224 ASTERN(OOR1, start); 1225 AHEAD(start); /* ... fix it */ 1226 EMIT(OOR2, 0); 1227 AHEAD(THERE()); 1228 ASTERN(O_CH, THERETHERE()); 1229 break; 1230 case REP(1, 1): /* trivial case */ 1231 /* done */ 1232 break; 1233 case REP(1, N): /* as x?x{1,n-1} */ 1234 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1235 INSERT(OCH_, start); 1236 ASTERN(OOR1, start); 1237 AHEAD(start); 1238 EMIT(OOR2, 0); /* offset very wrong... */ 1239 AHEAD(THERE()); /* ...so fix it */ 1240 ASTERN(O_CH, THERETHERE()); 1241 copy = dupl(p, start+1, finish+1); 1242 assert(copy == finish+4); 1243 repeat(p, copy, 1, to-1); 1244 break; 1245 case REP(1, INF): /* as x+ */ 1246 INSERT(OPLUS_, start); 1247 ASTERN(O_PLUS, start); 1248 break; 1249 case REP(N, N): /* as xx{m-1,n-1} */ 1250 copy = dupl(p, start, finish); 1251 repeat(p, copy, from-1, to-1); 1252 break; 1253 case REP(N, INF): /* as xx{n-1,INF} */ 1254 copy = dupl(p, start, finish); 1255 repeat(p, copy, from-1, to); 1256 break; 1257 default: /* "can't happen" */ 1258 SETERROR(REG_ASSERT); /* just in case */ 1259 break; 1260 } 1261 } 1262 1263 /* 1264 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1265 - character from the parse struct, signals a REG_ILLSEQ error if the 1266 - character can't be converted. Returns the number of bytes consumed. 1267 */ 1268 static wint_t 1269 wgetnext(struct parse *p) 1270 { 1271 mbstate_t mbs; 1272 wchar_t wc; 1273 size_t n; 1274 1275 memset(&mbs, 0, sizeof(mbs)); 1276 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); 1277 if (n == (size_t)-1 || n == (size_t)-2) { 1278 SETERROR(REG_ILLSEQ); 1279 return (0); 1280 } 1281 if (n == 0) 1282 n = 1; 1283 p->next += n; 1284 return (wc); 1285 } 1286 1287 /* 1288 - seterr - set an error condition 1289 == static int seterr(struct parse *p, int e); 1290 */ 1291 static int /* useless but makes type checking happy */ 1292 seterr(struct parse *p, int e) 1293 { 1294 if (p->error == 0) /* keep earliest error condition */ 1295 p->error = e; 1296 p->next = nuls; /* try to bring things to a halt */ 1297 p->end = nuls; 1298 return(0); /* make the return value well-defined */ 1299 } 1300 1301 /* 1302 - allocset - allocate a set of characters for [] 1303 == static cset *allocset(struct parse *p); 1304 */ 1305 static cset * 1306 allocset(struct parse *p) 1307 { 1308 cset *cs, *ncs; 1309 1310 ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs)); 1311 if (ncs == NULL) { 1312 SETERROR(REG_ESPACE); 1313 return (NULL); 1314 } 1315 p->g->sets = ncs; 1316 cs = &p->g->sets[p->g->ncsets++]; 1317 memset(cs, 0, sizeof(*cs)); 1318 1319 return(cs); 1320 } 1321 1322 /* 1323 - freeset - free a now-unused set 1324 == static void freeset(struct parse *p, cset *cs); 1325 */ 1326 static void 1327 freeset(struct parse *p, cset *cs) 1328 { 1329 cset *top = &p->g->sets[p->g->ncsets]; 1330 1331 free(cs->wides); 1332 free(cs->ranges); 1333 free(cs->types); 1334 memset(cs, 0, sizeof(*cs)); 1335 if (cs == top-1) /* recover only the easy case */ 1336 p->g->ncsets--; 1337 } 1338 1339 /* 1340 - singleton - Determine whether a set contains only one character, 1341 - returning it if so, otherwise returning OUT. 1342 */ 1343 static wint_t 1344 singleton(cset *cs) 1345 { 1346 wint_t i, s, n; 1347 1348 for (i = n = 0; i < NC; i++) 1349 if (CHIN(cs, i)) { 1350 n++; 1351 s = i; 1352 } 1353 if (n == 1) 1354 return (s); 1355 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1356 cs->icase == 0) 1357 return (cs->wides[0]); 1358 /* Don't bother handling the other cases. */ 1359 return (OUT); 1360 } 1361 1362 /* 1363 - CHadd - add character to character set. 1364 */ 1365 static void 1366 CHadd(struct parse *p, cset *cs, wint_t ch) 1367 { 1368 wint_t nch, *newwides; 1369 assert(ch >= 0); 1370 if (ch < NC) 1371 cs->bmp[ch >> 3] |= 1 << (ch & 7); 1372 else { 1373 newwides = reallocarray(cs->wides, cs->nwides + 1, 1374 sizeof(*cs->wides)); 1375 if (newwides == NULL) { 1376 SETERROR(REG_ESPACE); 1377 return; 1378 } 1379 cs->wides = newwides; 1380 cs->wides[cs->nwides++] = ch; 1381 } 1382 if (cs->icase) { 1383 if ((nch = towlower(ch)) < NC) 1384 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1385 if ((nch = towupper(ch)) < NC) 1386 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1387 } 1388 } 1389 1390 /* 1391 - CHaddrange - add all characters in the range [min,max] to a character set. 1392 */ 1393 static void 1394 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 1395 { 1396 crange *newranges; 1397 1398 for (; min < NC && min <= max; min++) 1399 CHadd(p, cs, min); 1400 if (min >= max) 1401 return; 1402 newranges = reallocarray(cs->ranges, cs->nranges + 1, 1403 sizeof(*cs->ranges)); 1404 if (newranges == NULL) { 1405 SETERROR(REG_ESPACE); 1406 return; 1407 } 1408 cs->ranges = newranges; 1409 cs->ranges[cs->nranges].min = min; 1410 cs->ranges[cs->nranges].max = max; 1411 cs->nranges++; 1412 } 1413 1414 /* 1415 - CHaddtype - add all characters of a certain type to a character set. 1416 */ 1417 static void 1418 CHaddtype(struct parse *p, cset *cs, wctype_t wct) 1419 { 1420 wint_t i; 1421 wctype_t *newtypes; 1422 1423 for (i = 0; i < NC; i++) 1424 if (iswctype(i, wct)) 1425 CHadd(p, cs, i); 1426 newtypes = reallocarray(cs->types, cs->ntypes + 1, 1427 sizeof(*cs->types)); 1428 if (newtypes == NULL) { 1429 SETERROR(REG_ESPACE); 1430 return; 1431 } 1432 cs->types = newtypes; 1433 cs->types[cs->ntypes++] = wct; 1434 } 1435 1436 /* 1437 - dupl - emit a duplicate of a bunch of sops 1438 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1439 */ 1440 static sopno /* start of duplicate */ 1441 dupl(struct parse *p, 1442 sopno start, /* from here */ 1443 sopno finish) /* to this less one */ 1444 { 1445 sopno ret = HERE(); 1446 sopno len = finish - start; 1447 1448 assert(finish >= start); 1449 if (len == 0) 1450 return(ret); 1451 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1452 return(ret); 1453 (void) memcpy((char *)(p->strip + p->slen), 1454 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1455 p->slen += len; 1456 return(ret); 1457 } 1458 1459 /* 1460 - doemit - emit a strip operator 1461 == static void doemit(struct parse *p, sop op, size_t opnd); 1462 * 1463 * It might seem better to implement this as a macro with a function as 1464 * hard-case backup, but it's just too big and messy unless there are 1465 * some changes to the data structures. Maybe later. 1466 */ 1467 static void 1468 doemit(struct parse *p, sop op, size_t opnd) 1469 { 1470 /* avoid making error situations worse */ 1471 if (p->error != 0) 1472 return; 1473 1474 /* deal with oversize operands ("can't happen", more or less) */ 1475 assert(opnd < 1<<OPSHIFT); 1476 1477 /* deal with undersized strip */ 1478 if (p->slen >= p->ssize) 1479 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1480 return; 1481 1482 /* finally, it's all reduced to the easy case */ 1483 p->strip[p->slen++] = SOP(op, opnd); 1484 } 1485 1486 /* 1487 - doinsert - insert a sop into the strip 1488 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1489 */ 1490 static void 1491 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1492 { 1493 sopno sn; 1494 sop s; 1495 int i; 1496 1497 /* avoid making error situations worse */ 1498 if (p->error != 0) 1499 return; 1500 1501 sn = HERE(); 1502 EMIT(op, opnd); /* do checks, ensure space */ 1503 assert(HERE() == sn+1); 1504 s = p->strip[sn]; 1505 1506 /* adjust paren pointers */ 1507 assert(pos > 0); 1508 for (i = 1; i < NPAREN; i++) { 1509 if (p->pbegin[i] >= pos) { 1510 p->pbegin[i]++; 1511 } 1512 if (p->pend[i] >= pos) { 1513 p->pend[i]++; 1514 } 1515 } 1516 1517 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1518 (HERE()-pos-1)*sizeof(sop)); 1519 p->strip[pos] = s; 1520 } 1521 1522 /* 1523 - dofwd - complete a forward reference 1524 == static void dofwd(struct parse *p, sopno pos, sop value); 1525 */ 1526 static void 1527 dofwd(struct parse *p, sopno pos, sop value) 1528 { 1529 /* avoid making error situations worse */ 1530 if (p->error != 0) 1531 return; 1532 1533 assert(value < 1<<OPSHIFT); 1534 p->strip[pos] = OP(p->strip[pos]) | value; 1535 } 1536 1537 /* 1538 - enlarge - enlarge the strip 1539 == static int enlarge(struct parse *p, sopno size); 1540 */ 1541 static int 1542 enlarge(struct parse *p, sopno size) 1543 { 1544 sop *sp; 1545 1546 if (p->ssize >= size) 1547 return 1; 1548 1549 sp = reallocarray(p->strip, size, sizeof(sop)); 1550 if (sp == NULL) { 1551 SETERROR(REG_ESPACE); 1552 return 0; 1553 } 1554 p->strip = sp; 1555 p->ssize = size; 1556 return 1; 1557 } 1558 1559 /* 1560 - stripsnug - compact the strip 1561 == static void stripsnug(struct parse *p, struct re_guts *g); 1562 */ 1563 static void 1564 stripsnug(struct parse *p, struct re_guts *g) 1565 { 1566 g->nstates = p->slen; 1567 g->strip = reallocarray((char *)p->strip, p->slen, sizeof(sop)); 1568 if (g->strip == NULL) { 1569 SETERROR(REG_ESPACE); 1570 g->strip = p->strip; 1571 } 1572 } 1573 1574 /* 1575 - findmust - fill in must and mlen with longest mandatory literal string 1576 == static void findmust(struct parse *p, struct re_guts *g); 1577 * 1578 * This algorithm could do fancy things like analyzing the operands of | 1579 * for common subsequences. Someday. This code is simple and finds most 1580 * of the interesting cases. 1581 * 1582 * Note that must and mlen got initialized during setup. 1583 */ 1584 static void 1585 findmust(struct parse *p, struct re_guts *g) 1586 { 1587 sop *scan; 1588 sop *start = NULL; 1589 sop *newstart = NULL; 1590 sopno newlen; 1591 sop s; 1592 char *cp; 1593 int offset; 1594 char buf[MB_LEN_MAX]; 1595 size_t clen; 1596 mbstate_t mbs; 1597 1598 /* avoid making error situations worse */ 1599 if (p->error != 0) 1600 return; 1601 1602 /* 1603 * It's not generally safe to do a ``char'' substring search on 1604 * multibyte character strings, but it's safe for at least 1605 * UTF-8 (see RFC 3629). 1606 */ 1607 if (MB_CUR_MAX > 1 && 1608 strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) 1609 return; 1610 1611 /* find the longest OCHAR sequence in strip */ 1612 newlen = 0; 1613 offset = 0; 1614 g->moffset = 0; 1615 scan = g->strip + 1; 1616 do { 1617 s = *scan++; 1618 switch (OP(s)) { 1619 case OCHAR: /* sequence member */ 1620 if (newlen == 0) { /* new sequence */ 1621 memset(&mbs, 0, sizeof(mbs)); 1622 newstart = scan - 1; 1623 } 1624 clen = wcrtomb(buf, OPND(s), &mbs); 1625 if (clen == (size_t)-1) 1626 goto toohard; 1627 newlen += clen; 1628 break; 1629 case OPLUS_: /* things that don't break one */ 1630 case OLPAREN: 1631 case ORPAREN: 1632 break; 1633 case OQUEST_: /* things that must be skipped */ 1634 case OCH_: 1635 offset = altoffset(scan, offset); 1636 scan--; 1637 do { 1638 scan += OPND(s); 1639 s = *scan; 1640 /* assert() interferes w debug printouts */ 1641 if (OP(s) != (sop)O_QUEST && 1642 OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) { 1643 g->iflags |= BAD; 1644 return; 1645 } 1646 } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH); 1647 /* FALLTHROUGH */ 1648 case OBOW: /* things that break a sequence */ 1649 case OEOW: 1650 case OBOL: 1651 case OEOL: 1652 case O_QUEST: 1653 case O_CH: 1654 case OEND: 1655 if (newlen > (sopno)g->mlen) { /* ends one */ 1656 start = newstart; 1657 g->mlen = newlen; 1658 if (offset > -1) { 1659 g->moffset += offset; 1660 offset = newlen; 1661 } else 1662 g->moffset = offset; 1663 } else { 1664 if (offset > -1) 1665 offset += newlen; 1666 } 1667 newlen = 0; 1668 break; 1669 case OANY: 1670 if (newlen > (sopno)g->mlen) { /* ends one */ 1671 start = newstart; 1672 g->mlen = newlen; 1673 if (offset > -1) { 1674 g->moffset += offset; 1675 offset = newlen; 1676 } else 1677 g->moffset = offset; 1678 } else { 1679 if (offset > -1) 1680 offset += newlen; 1681 } 1682 if (offset > -1) 1683 offset++; 1684 newlen = 0; 1685 break; 1686 case OANYOF: /* may or may not invalidate offset */ 1687 /* First, everything as OANY */ 1688 if (newlen > (sopno)g->mlen) { /* ends one */ 1689 start = newstart; 1690 g->mlen = newlen; 1691 if (offset > -1) { 1692 g->moffset += offset; 1693 offset = newlen; 1694 } else 1695 g->moffset = offset; 1696 } else { 1697 if (offset > -1) 1698 offset += newlen; 1699 } 1700 if (offset > -1) 1701 offset++; 1702 newlen = 0; 1703 break; 1704 toohard: 1705 default: 1706 /* Anything here makes it impossible or too hard 1707 * to calculate the offset -- so we give up; 1708 * save the last known good offset, in case the 1709 * must sequence doesn't occur later. 1710 */ 1711 if (newlen > (sopno)g->mlen) { /* ends one */ 1712 start = newstart; 1713 g->mlen = newlen; 1714 if (offset > -1) 1715 g->moffset += offset; 1716 else 1717 g->moffset = offset; 1718 } 1719 offset = -1; 1720 newlen = 0; 1721 break; 1722 } 1723 } while (OP(s) != OEND); 1724 1725 if (g->mlen == 0) { /* there isn't one */ 1726 g->moffset = -1; 1727 return; 1728 } 1729 1730 /* turn it into a character string */ 1731 g->must = malloc((size_t)g->mlen + 1); 1732 if (g->must == NULL) { /* argh; just forget it */ 1733 g->mlen = 0; 1734 g->moffset = -1; 1735 return; 1736 } 1737 cp = g->must; 1738 scan = start; 1739 memset(&mbs, 0, sizeof(mbs)); 1740 while (cp < g->must + g->mlen) { 1741 while (OP(s = *scan++) != OCHAR) 1742 continue; 1743 clen = wcrtomb(cp, OPND(s), &mbs); 1744 assert(clen != (size_t)-1); 1745 cp += clen; 1746 } 1747 assert(cp == g->must + g->mlen); 1748 *cp++ = '\0'; /* just on general principles */ 1749 } 1750 1751 /* 1752 - altoffset - choose biggest offset among multiple choices 1753 == static int altoffset(sop *scan, int offset); 1754 * 1755 * Compute, recursively if necessary, the largest offset among multiple 1756 * re paths. 1757 */ 1758 static int 1759 altoffset(sop *scan, int offset) 1760 { 1761 int largest; 1762 int try; 1763 sop s; 1764 1765 /* If we gave up already on offsets, return */ 1766 if (offset == -1) 1767 return -1; 1768 1769 largest = 0; 1770 try = 0; 1771 s = *scan++; 1772 while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) { 1773 switch (OP(s)) { 1774 case OOR1: 1775 if (try > largest) 1776 largest = try; 1777 try = 0; 1778 break; 1779 case OQUEST_: 1780 case OCH_: 1781 try = altoffset(scan, try); 1782 if (try == -1) 1783 return -1; 1784 scan--; 1785 do { 1786 scan += OPND(s); 1787 s = *scan; 1788 if (OP(s) != (sop)O_QUEST && 1789 OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) 1790 return -1; 1791 } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH); 1792 /* We must skip to the next position, or we'll 1793 * leave altoffset() too early. 1794 */ 1795 scan++; 1796 break; 1797 case OANYOF: 1798 case OCHAR: 1799 case OANY: 1800 try++; 1801 case OBOW: 1802 case OEOW: 1803 case OLPAREN: 1804 case ORPAREN: 1805 case OOR2: 1806 break; 1807 default: 1808 try = -1; 1809 break; 1810 } 1811 if (try == -1) 1812 return -1; 1813 s = *scan++; 1814 } 1815 1816 if (try > largest) 1817 largest = try; 1818 1819 return largest+offset; 1820 } 1821 1822 /* 1823 - computejumps - compute char jumps for BM scan 1824 == static void computejumps(struct parse *p, struct re_guts *g); 1825 * 1826 * This algorithm assumes g->must exists and is has size greater than 1827 * zero. It's based on the algorithm found on Computer Algorithms by 1828 * Sara Baase. 1829 * 1830 * A char jump is the number of characters one needs to jump based on 1831 * the value of the character from the text that was mismatched. 1832 */ 1833 static void 1834 computejumps(struct parse *p, struct re_guts *g) 1835 { 1836 int ch; 1837 int mindex; 1838 1839 /* Avoid making errors worse */ 1840 if (p->error != 0) 1841 return; 1842 1843 g->charjump = (int *)malloc((NC_MAX + 1) * sizeof(int)); 1844 if (g->charjump == NULL) /* Not a fatal error */ 1845 return; 1846 /* Adjust for signed chars, if necessary */ 1847 g->charjump = &g->charjump[-(CHAR_MIN)]; 1848 1849 /* If the character does not exist in the pattern, the jump 1850 * is equal to the number of characters in the pattern. 1851 */ 1852 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 1853 g->charjump[ch] = g->mlen; 1854 1855 /* If the character does exist, compute the jump that would 1856 * take us to the last character in the pattern equal to it 1857 * (notice that we match right to left, so that last character 1858 * is the first one that would be matched). 1859 */ 1860 for (mindex = 0; mindex < g->mlen; mindex++) 1861 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 1862 } 1863 1864 /* 1865 - computematchjumps - compute match jumps for BM scan 1866 == static void computematchjumps(struct parse *p, struct re_guts *g); 1867 * 1868 * This algorithm assumes g->must exists and is has size greater than 1869 * zero. It's based on the algorithm found on Computer Algorithms by 1870 * Sara Baase. 1871 * 1872 * A match jump is the number of characters one needs to advance based 1873 * on the already-matched suffix. 1874 * Notice that all values here are minus (g->mlen-1), because of the way 1875 * the search algorithm works. 1876 */ 1877 static void 1878 computematchjumps(struct parse *p, struct re_guts *g) 1879 { 1880 int mindex; /* General "must" iterator */ 1881 int suffix; /* Keeps track of matching suffix */ 1882 int ssuffix; /* Keeps track of suffixes' suffix */ 1883 int* pmatches; /* pmatches[k] points to the next i 1884 * such that i+1...mlen is a substring 1885 * of k+1...k+mlen-i-1 1886 */ 1887 1888 /* Avoid making errors worse */ 1889 if (p->error != 0) 1890 return; 1891 1892 pmatches = (int*) malloc(g->mlen * sizeof(int)); 1893 if (pmatches == NULL) { 1894 g->matchjump = NULL; 1895 return; 1896 } 1897 1898 g->matchjump = (int*) malloc(g->mlen * sizeof(int)); 1899 if (g->matchjump == NULL) { /* Not a fatal error */ 1900 free(pmatches); 1901 return; 1902 } 1903 1904 /* Set maximum possible jump for each character in the pattern */ 1905 for (mindex = 0; mindex < g->mlen; mindex++) 1906 g->matchjump[mindex] = 2*g->mlen - mindex - 1; 1907 1908 /* Compute pmatches[] */ 1909 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 1910 mindex--, suffix--) { 1911 pmatches[mindex] = suffix; 1912 1913 /* If a mismatch is found, interrupting the substring, 1914 * compute the matchjump for that position. If no 1915 * mismatch is found, then a text substring mismatched 1916 * against the suffix will also mismatch against the 1917 * substring. 1918 */ 1919 while (suffix < g->mlen 1920 && g->must[mindex] != g->must[suffix]) { 1921 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1922 g->mlen - mindex - 1); 1923 suffix = pmatches[suffix]; 1924 } 1925 } 1926 1927 /* Compute the matchjump up to the last substring found to jump 1928 * to the beginning of the largest must pattern prefix matching 1929 * it's own suffix. 1930 */ 1931 for (mindex = 0; mindex <= suffix; mindex++) 1932 g->matchjump[mindex] = MIN(g->matchjump[mindex], 1933 g->mlen + suffix - mindex); 1934 1935 ssuffix = pmatches[suffix]; 1936 while (suffix < g->mlen) { 1937 while (suffix <= ssuffix && suffix < g->mlen) { 1938 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1939 g->mlen + ssuffix - suffix); 1940 suffix++; 1941 } 1942 if (suffix < g->mlen) 1943 ssuffix = pmatches[ssuffix]; 1944 } 1945 1946 free(pmatches); 1947 } 1948 1949 /* 1950 - pluscount - count + nesting 1951 == static sopno pluscount(struct parse *p, struct re_guts *g); 1952 */ 1953 static sopno /* nesting depth */ 1954 pluscount(struct parse *p, struct re_guts *g) 1955 { 1956 sop *scan; 1957 sop s; 1958 sopno plusnest = 0; 1959 sopno maxnest = 0; 1960 1961 if (p->error != 0) 1962 return(0); /* there may not be an OEND */ 1963 1964 scan = g->strip + 1; 1965 do { 1966 s = *scan++; 1967 switch (OP(s)) { 1968 case OPLUS_: 1969 plusnest++; 1970 break; 1971 case O_PLUS: 1972 if (plusnest > maxnest) 1973 maxnest = plusnest; 1974 plusnest--; 1975 break; 1976 } 1977 } while (OP(s) != OEND); 1978 if (plusnest != 0) 1979 g->iflags |= BAD; 1980 return(maxnest); 1981 } 1982