1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Copyright (c) 2011 The FreeBSD Foundation 9 * All rights reserved. 10 * Portions of this software were developed by David Chisnall 11 * under sponsorship from the FreeBSD Foundation. 12 * 13 * This code is derived from software contributed to Berkeley by 14 * Henry Spencer. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 41 */ 42 43 #if defined(LIBC_SCCS) && !defined(lint) 44 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 45 #endif /* LIBC_SCCS and not lint */ 46 #include <sys/cdefs.h> 47 __FBSDID("$FreeBSD$"); 48 49 #include <sys/types.h> 50 #include <stdio.h> 51 #include <string.h> 52 #include <ctype.h> 53 #include <limits.h> 54 #include <stdlib.h> 55 #include <regex.h> 56 #include <stdbool.h> 57 #include <wchar.h> 58 #include <wctype.h> 59 60 #include "collate.h" 61 62 #include "utils.h" 63 #include "regex2.h" 64 65 #include "cname.h" 66 67 /* 68 * Branching context, used to keep track of branch state for all of the branch- 69 * aware functions. In addition to keeping track of branch positions for the 70 * p_branch_* functions, we use this to simplify some clumsiness in BREs for 71 * detection of whether ^ is acting as an anchor or being used erroneously and 72 * also for whether we're in a sub-expression or not. 73 */ 74 struct branchc { 75 sopno start; 76 sopno back; 77 sopno fwd; 78 79 int nbranch; 80 int nchain; 81 bool outer; 82 bool terminate; 83 }; 84 85 /* 86 * parse structure, passed up and down to avoid global variables and 87 * other clumsinesses 88 */ 89 struct parse { 90 const char *next; /* next character in RE */ 91 const char *end; /* end of string (-> NUL normally) */ 92 int error; /* has an error been seen? */ 93 sop *strip; /* malloced strip */ 94 sopno ssize; /* malloced strip size (allocated) */ 95 sopno slen; /* malloced strip length (used) */ 96 int ncsalloc; /* number of csets allocated */ 97 struct re_guts *g; 98 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 99 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 100 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 101 bool allowbranch; /* can this expression branch? */ 102 bool bre; /* convenience; is this a BRE? */ 103 bool (*parse_expr)(struct parse *, struct branchc *); 104 void (*pre_parse)(struct parse *, struct branchc *); 105 void (*post_parse)(struct parse *, struct branchc *); 106 }; 107 108 /* ========= begin header generated by ./mkh ========= */ 109 #ifdef __cplusplus 110 extern "C" { 111 #endif 112 113 /* === regcomp.c === */ 114 static bool p_ere_exp(struct parse *p, struct branchc *bc); 115 static void p_str(struct parse *p); 116 static int p_branch_eat_delim(struct parse *p, struct branchc *bc); 117 static void p_branch_ins_offset(struct parse *p, struct branchc *bc); 118 static void p_branch_fix_tail(struct parse *p, struct branchc *bc); 119 static bool p_branch_empty(struct parse *p, struct branchc *bc); 120 static bool p_branch_do(struct parse *p, struct branchc *bc); 121 static void p_bre_pre_parse(struct parse *p, struct branchc *bc); 122 static void p_bre_post_parse(struct parse *p, struct branchc *bc); 123 static void p_re(struct parse *p, int end1, int end2); 124 static bool p_simp_re(struct parse *p, struct branchc *bc); 125 static int p_count(struct parse *p); 126 static void p_bracket(struct parse *p); 127 static void p_b_term(struct parse *p, cset *cs); 128 static void p_b_cclass(struct parse *p, cset *cs); 129 static void p_b_eclass(struct parse *p, cset *cs); 130 static wint_t p_b_symbol(struct parse *p); 131 static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 132 static wint_t othercase(wint_t ch); 133 static void bothcases(struct parse *p, wint_t ch); 134 static void ordinary(struct parse *p, wint_t ch); 135 static void nonnewline(struct parse *p); 136 static void repeat(struct parse *p, sopno start, int from, int to); 137 static int seterr(struct parse *p, int e); 138 static cset *allocset(struct parse *p); 139 static void freeset(struct parse *p, cset *cs); 140 static void CHadd(struct parse *p, cset *cs, wint_t ch); 141 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 142 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 143 static wint_t singleton(cset *cs); 144 static sopno dupl(struct parse *p, sopno start, sopno finish); 145 static void doemit(struct parse *p, sop op, size_t opnd); 146 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 147 static void dofwd(struct parse *p, sopno pos, sop value); 148 static int enlarge(struct parse *p, sopno size); 149 static void stripsnug(struct parse *p, struct re_guts *g); 150 static void findmust(struct parse *p, struct re_guts *g); 151 static int altoffset(sop *scan, int offset); 152 static void computejumps(struct parse *p, struct re_guts *g); 153 static void computematchjumps(struct parse *p, struct re_guts *g); 154 static sopno pluscount(struct parse *p, struct re_guts *g); 155 static wint_t wgetnext(struct parse *p); 156 157 #ifdef __cplusplus 158 } 159 #endif 160 /* ========= end header generated by ./mkh ========= */ 161 162 static char nuls[10]; /* place to point scanner in event of error */ 163 164 /* 165 * macros for use with parse structure 166 * BEWARE: these know that the parse structure is named `p' !!! 167 */ 168 #define PEEK() (*p->next) 169 #define PEEK2() (*(p->next+1)) 170 #define MORE() (p->next < p->end) 171 #define MORE2() (p->next+1 < p->end) 172 #define SEE(c) (MORE() && PEEK() == (c)) 173 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 174 #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a)) 175 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 176 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 177 #define NEXT() (p->next++) 178 #define NEXT2() (p->next += 2) 179 #define NEXTn(n) (p->next += (n)) 180 #define GETNEXT() (*p->next++) 181 #define WGETNEXT() wgetnext(p) 182 #define SETERROR(e) seterr(p, (e)) 183 #define REQUIRE(co, e) ((co) || SETERROR(e)) 184 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 185 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 186 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 187 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 188 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 189 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 190 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 191 #define HERE() (p->slen) 192 #define THERE() (p->slen - 1) 193 #define THERETHERE() (p->slen - 2) 194 #define DROP(n) (p->slen -= (n)) 195 196 #ifndef NDEBUG 197 static int never = 0; /* for use in asserts; shuts lint up */ 198 #else 199 #define never 0 /* some <assert.h>s have bugs too */ 200 #endif 201 202 /* Macro used by computejump()/computematchjump() */ 203 #define MIN(a,b) ((a)<(b)?(a):(b)) 204 205 /* 206 - regcomp - interface for parser and compilation 207 = extern int regcomp(regex_t *, const char *, int); 208 = #define REG_BASIC 0000 209 = #define REG_EXTENDED 0001 210 = #define REG_ICASE 0002 211 = #define REG_NOSUB 0004 212 = #define REG_NEWLINE 0010 213 = #define REG_NOSPEC 0020 214 = #define REG_PEND 0040 215 = #define REG_DUMP 0200 216 */ 217 int /* 0 success, otherwise REG_something */ 218 regcomp(regex_t * __restrict preg, 219 const char * __restrict pattern, 220 int cflags) 221 { 222 struct parse pa; 223 struct re_guts *g; 224 struct parse *p = &pa; 225 int i; 226 size_t len; 227 size_t maxlen; 228 #ifdef REDEBUG 229 # define GOODFLAGS(f) (f) 230 #else 231 # define GOODFLAGS(f) ((f)&~REG_DUMP) 232 #endif 233 234 cflags = GOODFLAGS(cflags); 235 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 236 return(REG_INVARG); 237 238 if (cflags®_PEND) { 239 if (preg->re_endp < pattern) 240 return(REG_INVARG); 241 len = preg->re_endp - pattern; 242 } else 243 len = strlen(pattern); 244 245 /* do the mallocs early so failure handling is easy */ 246 g = (struct re_guts *)malloc(sizeof(struct re_guts)); 247 if (g == NULL) 248 return(REG_ESPACE); 249 /* 250 * Limit the pattern space to avoid a 32-bit overflow on buffer 251 * extension. Also avoid any signed overflow in case of conversion 252 * so make the real limit based on a 31-bit overflow. 253 * 254 * Likely not applicable on 64-bit systems but handle the case 255 * generically (who are we to stop people from using ~715MB+ 256 * patterns?). 257 */ 258 maxlen = ((size_t)-1 >> 1) / sizeof(sop) * 2 / 3; 259 if (len >= maxlen) { 260 free((char *)g); 261 return(REG_ESPACE); 262 } 263 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 264 assert(p->ssize >= len); 265 266 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 267 p->slen = 0; 268 if (p->strip == NULL) { 269 free((char *)g); 270 return(REG_ESPACE); 271 } 272 273 /* set things up */ 274 p->g = g; 275 p->next = pattern; /* convenience; we do not modify it */ 276 p->end = p->next + len; 277 p->error = 0; 278 p->ncsalloc = 0; 279 for (i = 0; i < NPAREN; i++) { 280 p->pbegin[i] = 0; 281 p->pend[i] = 0; 282 } 283 if (cflags & REG_EXTENDED) { 284 p->allowbranch = true; 285 p->bre = false; 286 p->parse_expr = p_ere_exp; 287 p->pre_parse = NULL; 288 p->post_parse = NULL; 289 } else { 290 p->allowbranch = false; 291 p->bre = true; 292 p->parse_expr = p_simp_re; 293 p->pre_parse = p_bre_pre_parse; 294 p->post_parse = p_bre_post_parse; 295 } 296 g->sets = NULL; 297 g->ncsets = 0; 298 g->cflags = cflags; 299 g->iflags = 0; 300 g->nbol = 0; 301 g->neol = 0; 302 g->must = NULL; 303 g->moffset = -1; 304 g->charjump = NULL; 305 g->matchjump = NULL; 306 g->mlen = 0; 307 g->nsub = 0; 308 g->backrefs = 0; 309 310 /* do it */ 311 EMIT(OEND, 0); 312 g->firststate = THERE(); 313 if (cflags & REG_NOSPEC) 314 p_str(p); 315 else 316 p_re(p, OUT, OUT); 317 EMIT(OEND, 0); 318 g->laststate = THERE(); 319 320 /* tidy up loose ends and fill things in */ 321 stripsnug(p, g); 322 findmust(p, g); 323 /* only use Boyer-Moore algorithm if the pattern is bigger 324 * than three characters 325 */ 326 if(g->mlen > 3) { 327 computejumps(p, g); 328 computematchjumps(p, g); 329 if(g->matchjump == NULL && g->charjump != NULL) { 330 free(g->charjump); 331 g->charjump = NULL; 332 } 333 } 334 g->nplus = pluscount(p, g); 335 g->magic = MAGIC2; 336 preg->re_nsub = g->nsub; 337 preg->re_g = g; 338 preg->re_magic = MAGIC1; 339 #ifndef REDEBUG 340 /* not debugging, so can't rely on the assert() in regexec() */ 341 if (g->iflags&BAD) 342 SETERROR(REG_ASSERT); 343 #endif 344 345 /* win or lose, we're done */ 346 if (p->error != 0) /* lose */ 347 regfree(preg); 348 return(p->error); 349 } 350 351 /* 352 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op, 353 - return whether we should terminate or not 354 == static bool p_ere_exp(struct parse *p); 355 */ 356 static bool 357 p_ere_exp(struct parse *p, struct branchc *bc) 358 { 359 char c; 360 wint_t wc; 361 sopno pos; 362 int count; 363 int count2; 364 sopno subno; 365 int wascaret = 0; 366 367 assert(MORE()); /* caller should have ensured this */ 368 c = GETNEXT(); 369 370 pos = HERE(); 371 switch (c) { 372 case '(': 373 (void)REQUIRE(MORE(), REG_EPAREN); 374 p->g->nsub++; 375 subno = p->g->nsub; 376 if (subno < NPAREN) 377 p->pbegin[subno] = HERE(); 378 EMIT(OLPAREN, subno); 379 if (!SEE(')')) 380 p_re(p, ')', IGN); 381 if (subno < NPAREN) { 382 p->pend[subno] = HERE(); 383 assert(p->pend[subno] != 0); 384 } 385 EMIT(ORPAREN, subno); 386 (void)MUSTEAT(')', REG_EPAREN); 387 break; 388 #ifndef POSIX_MISTAKE 389 case ')': /* happens only if no current unmatched ( */ 390 /* 391 * You may ask, why the ifndef? Because I didn't notice 392 * this until slightly too late for 1003.2, and none of the 393 * other 1003.2 regular-expression reviewers noticed it at 394 * all. So an unmatched ) is legal POSIX, at least until 395 * we can get it fixed. 396 */ 397 SETERROR(REG_EPAREN); 398 break; 399 #endif 400 case '^': 401 EMIT(OBOL, 0); 402 p->g->iflags |= USEBOL; 403 p->g->nbol++; 404 wascaret = 1; 405 break; 406 case '$': 407 EMIT(OEOL, 0); 408 p->g->iflags |= USEEOL; 409 p->g->neol++; 410 break; 411 case '|': 412 SETERROR(REG_EMPTY); 413 break; 414 case '*': 415 case '+': 416 case '?': 417 case '{': 418 SETERROR(REG_BADRPT); 419 break; 420 case '.': 421 if (p->g->cflags®_NEWLINE) 422 nonnewline(p); 423 else 424 EMIT(OANY, 0); 425 break; 426 case '[': 427 p_bracket(p); 428 break; 429 case '\\': 430 (void)REQUIRE(MORE(), REG_EESCAPE); 431 wc = WGETNEXT(); 432 switch (wc) { 433 case '<': 434 EMIT(OBOW, 0); 435 break; 436 case '>': 437 EMIT(OEOW, 0); 438 break; 439 default: 440 ordinary(p, wc); 441 break; 442 } 443 break; 444 default: 445 if (p->error != 0) 446 return (false); 447 p->next--; 448 wc = WGETNEXT(); 449 ordinary(p, wc); 450 break; 451 } 452 453 if (!MORE()) 454 return (false); 455 c = PEEK(); 456 /* we call { a repetition if followed by a digit */ 457 if (!( c == '*' || c == '+' || c == '?' || c == '{')) 458 return (false); /* no repetition, we're done */ 459 else if (c == '{') 460 (void)REQUIRE(MORE2() && \ 461 (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT); 462 NEXT(); 463 464 (void)REQUIRE(!wascaret, REG_BADRPT); 465 switch (c) { 466 case '*': /* implemented as +? */ 467 /* this case does not require the (y|) trick, noKLUDGE */ 468 INSERT(OPLUS_, pos); 469 ASTERN(O_PLUS, pos); 470 INSERT(OQUEST_, pos); 471 ASTERN(O_QUEST, pos); 472 break; 473 case '+': 474 INSERT(OPLUS_, pos); 475 ASTERN(O_PLUS, pos); 476 break; 477 case '?': 478 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 479 INSERT(OCH_, pos); /* offset slightly wrong */ 480 ASTERN(OOR1, pos); /* this one's right */ 481 AHEAD(pos); /* fix the OCH_ */ 482 EMIT(OOR2, 0); /* offset very wrong... */ 483 AHEAD(THERE()); /* ...so fix it */ 484 ASTERN(O_CH, THERETHERE()); 485 break; 486 case '{': 487 count = p_count(p); 488 if (EAT(',')) { 489 if (isdigit((uch)PEEK())) { 490 count2 = p_count(p); 491 (void)REQUIRE(count <= count2, REG_BADBR); 492 } else /* single number with comma */ 493 count2 = INFINITY; 494 } else /* just a single number */ 495 count2 = count; 496 repeat(p, pos, count, count2); 497 if (!EAT('}')) { /* error heuristics */ 498 while (MORE() && PEEK() != '}') 499 NEXT(); 500 (void)REQUIRE(MORE(), REG_EBRACE); 501 SETERROR(REG_BADBR); 502 } 503 break; 504 } 505 506 if (!MORE()) 507 return (false); 508 c = PEEK(); 509 if (!( c == '*' || c == '+' || c == '?' || 510 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 511 return (false); 512 SETERROR(REG_BADRPT); 513 return (false); 514 } 515 516 /* 517 - p_str - string (no metacharacters) "parser" 518 == static void p_str(struct parse *p); 519 */ 520 static void 521 p_str(struct parse *p) 522 { 523 (void)REQUIRE(MORE(), REG_EMPTY); 524 while (MORE()) 525 ordinary(p, WGETNEXT()); 526 } 527 528 /* 529 * Eat consecutive branch delimiters for the kind of expression that we are 530 * parsing, return the number of delimiters that we ate. 531 */ 532 static int 533 p_branch_eat_delim(struct parse *p, struct branchc *bc) 534 { 535 int nskip; 536 537 nskip = 0; 538 while (EAT('|')) 539 ++nskip; 540 return (nskip); 541 } 542 543 /* 544 * Insert necessary branch book-keeping operations. This emits a 545 * bogus 'next' offset, since we still have more to parse 546 */ 547 static void 548 p_branch_ins_offset(struct parse *p, struct branchc *bc) 549 { 550 551 if (bc->nbranch == 0) { 552 INSERT(OCH_, bc->start); /* offset is wrong */ 553 bc->fwd = bc->start; 554 bc->back = bc->start; 555 } 556 557 ASTERN(OOR1, bc->back); 558 bc->back = THERE(); 559 AHEAD(bc->fwd); /* fix previous offset */ 560 bc->fwd = HERE(); 561 EMIT(OOR2, 0); /* offset is very wrong */ 562 ++bc->nbranch; 563 } 564 565 /* 566 * Fix the offset of the tail branch, if we actually had any branches. 567 * This is to correct the bogus placeholder offset that we use. 568 */ 569 static void 570 p_branch_fix_tail(struct parse *p, struct branchc *bc) 571 { 572 573 /* Fix bogus offset at the tail if we actually have branches */ 574 if (bc->nbranch > 0) { 575 AHEAD(bc->fwd); 576 ASTERN(O_CH, bc->back); 577 } 578 } 579 580 /* 581 * Signal to the parser that an empty branch has been encountered; this will, 582 * in the future, be used to allow for more permissive behavior with empty 583 * branches. The return value should indicate whether parsing may continue 584 * or not. 585 */ 586 static bool 587 p_branch_empty(struct parse *p, struct branchc *bc) 588 { 589 590 SETERROR(REG_EMPTY); 591 return (false); 592 } 593 594 /* 595 * Take care of any branching requirements. This includes inserting the 596 * appropriate branching instructions as well as eating all of the branch 597 * delimiters until we either run out of pattern or need to parse more pattern. 598 */ 599 static bool 600 p_branch_do(struct parse *p, struct branchc *bc) 601 { 602 int ate = 0; 603 604 ate = p_branch_eat_delim(p, bc); 605 if (ate == 0) 606 return (false); 607 else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc)) 608 /* 609 * Halt parsing only if we have an empty branch and p_branch_empty 610 * indicates that we must not continue. In the future, this will not 611 * necessarily be an error. 612 */ 613 return (false); 614 p_branch_ins_offset(p, bc); 615 616 return (true); 617 } 618 619 static void 620 p_bre_pre_parse(struct parse *p, struct branchc *bc) 621 { 622 623 (void) bc; 624 /* 625 * Does not move cleanly into expression parser because of 626 * ordinary interpration of * at the beginning position of 627 * an expression. 628 */ 629 if (EAT('^')) { 630 EMIT(OBOL, 0); 631 p->g->iflags |= USEBOL; 632 p->g->nbol++; 633 } 634 } 635 636 static void 637 p_bre_post_parse(struct parse *p, struct branchc *bc) 638 { 639 640 /* Expression is terminating due to EOL token */ 641 if (bc->terminate) { 642 DROP(1); 643 EMIT(OEOL, 0); 644 p->g->iflags |= USEEOL; 645 p->g->neol++; 646 } 647 } 648 649 /* 650 - p_re - Top level parser, concatenation and BRE anchoring 651 == static void p_re(struct parse *p, int end1, int end2); 652 * Giving end1 as OUT essentially eliminates the end1/end2 check. 653 * 654 * This implementation is a bit of a kludge, in that a trailing $ is first 655 * taken as an ordinary character and then revised to be an anchor. 656 * The amount of lookahead needed to avoid this kludge is excessive. 657 */ 658 static void 659 p_re(struct parse *p, 660 int end1, /* first terminating character */ 661 int end2) /* second terminating character; ignored for EREs */ 662 { 663 struct branchc bc; 664 665 bc.nbranch = 0; 666 if (end1 == OUT && end2 == OUT) 667 bc.outer = true; 668 else 669 bc.outer = false; 670 #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2)) 671 for (;;) { 672 bc.start = HERE(); 673 bc.nchain = 0; 674 bc.terminate = false; 675 if (p->pre_parse != NULL) 676 p->pre_parse(p, &bc); 677 while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) { 678 bc.terminate = p->parse_expr(p, &bc); 679 ++bc.nchain; 680 } 681 if (p->post_parse != NULL) 682 p->post_parse(p, &bc); 683 (void) REQUIRE(HERE() != bc.start, REG_EMPTY); 684 if (!p->allowbranch) 685 break; 686 /* 687 * p_branch_do's return value indicates whether we should 688 * continue parsing or not. This is both for correctness and 689 * a slight optimization, because it will check if we've 690 * encountered an empty branch or the end of the string 691 * immediately following a branch delimiter. 692 */ 693 if (!p_branch_do(p, &bc)) 694 break; 695 } 696 #undef SEE_END 697 if (p->allowbranch) 698 p_branch_fix_tail(p, &bc); 699 assert(!MORE() || SEE(end1)); 700 } 701 702 /* 703 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 704 == static bool p_simp_re(struct parse *p, struct branchc *bc); 705 */ 706 static bool /* was the simple RE an unbackslashed $? */ 707 p_simp_re(struct parse *p, struct branchc *bc) 708 { 709 int c; 710 int count; 711 int count2; 712 sopno pos; 713 int i; 714 wint_t wc; 715 sopno subno; 716 # define BACKSL (1<<CHAR_BIT) 717 718 pos = HERE(); /* repetition op, if any, covers from here */ 719 720 assert(MORE()); /* caller should have ensured this */ 721 c = GETNEXT(); 722 if (c == '\\') { 723 (void)REQUIRE(MORE(), REG_EESCAPE); 724 c = BACKSL | GETNEXT(); 725 } 726 switch (c) { 727 case '.': 728 if (p->g->cflags®_NEWLINE) 729 nonnewline(p); 730 else 731 EMIT(OANY, 0); 732 break; 733 case '[': 734 p_bracket(p); 735 break; 736 case BACKSL|'<': 737 EMIT(OBOW, 0); 738 break; 739 case BACKSL|'>': 740 EMIT(OEOW, 0); 741 break; 742 case BACKSL|'{': 743 SETERROR(REG_BADRPT); 744 break; 745 case BACKSL|'(': 746 p->g->nsub++; 747 subno = p->g->nsub; 748 if (subno < NPAREN) 749 p->pbegin[subno] = HERE(); 750 EMIT(OLPAREN, subno); 751 /* the MORE here is an error heuristic */ 752 if (MORE() && !SEETWO('\\', ')')) 753 p_re(p, '\\', ')'); 754 if (subno < NPAREN) { 755 p->pend[subno] = HERE(); 756 assert(p->pend[subno] != 0); 757 } 758 EMIT(ORPAREN, subno); 759 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 760 break; 761 case BACKSL|')': /* should not get here -- must be user */ 762 SETERROR(REG_EPAREN); 763 break; 764 case BACKSL|'1': 765 case BACKSL|'2': 766 case BACKSL|'3': 767 case BACKSL|'4': 768 case BACKSL|'5': 769 case BACKSL|'6': 770 case BACKSL|'7': 771 case BACKSL|'8': 772 case BACKSL|'9': 773 i = (c&~BACKSL) - '0'; 774 assert(i < NPAREN); 775 if (p->pend[i] != 0) { 776 assert(i <= p->g->nsub); 777 EMIT(OBACK_, i); 778 assert(p->pbegin[i] != 0); 779 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 780 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 781 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 782 EMIT(O_BACK, i); 783 } else 784 SETERROR(REG_ESUBREG); 785 p->g->backrefs = 1; 786 break; 787 case '*': 788 /* 789 * Ordinary if used as the first character beyond BOL anchor of 790 * a (sub-)expression, counts as a bad repetition operator if it 791 * appears otherwise. 792 */ 793 (void)REQUIRE(bc->nchain == 0, REG_BADRPT); 794 /* FALLTHROUGH */ 795 default: 796 if (p->error != 0) 797 return (false); /* Definitely not $... */ 798 p->next--; 799 wc = WGETNEXT(); 800 ordinary(p, wc); 801 break; 802 } 803 804 if (EAT('*')) { /* implemented as +? */ 805 /* this case does not require the (y|) trick, noKLUDGE */ 806 INSERT(OPLUS_, pos); 807 ASTERN(O_PLUS, pos); 808 INSERT(OQUEST_, pos); 809 ASTERN(O_QUEST, pos); 810 } else if (EATTWO('\\', '{')) { 811 count = p_count(p); 812 if (EAT(',')) { 813 if (MORE() && isdigit((uch)PEEK())) { 814 count2 = p_count(p); 815 (void)REQUIRE(count <= count2, REG_BADBR); 816 } else /* single number with comma */ 817 count2 = INFINITY; 818 } else /* just a single number */ 819 count2 = count; 820 repeat(p, pos, count, count2); 821 if (!EATTWO('\\', '}')) { /* error heuristics */ 822 while (MORE() && !SEETWO('\\', '}')) 823 NEXT(); 824 (void)REQUIRE(MORE(), REG_EBRACE); 825 SETERROR(REG_BADBR); 826 } 827 } else if (c == '$') /* $ (but not \$) ends it */ 828 return (true); 829 830 return (false); 831 } 832 833 /* 834 - p_count - parse a repetition count 835 == static int p_count(struct parse *p); 836 */ 837 static int /* the value */ 838 p_count(struct parse *p) 839 { 840 int count = 0; 841 int ndigits = 0; 842 843 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 844 count = count*10 + (GETNEXT() - '0'); 845 ndigits++; 846 } 847 848 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 849 return(count); 850 } 851 852 /* 853 - p_bracket - parse a bracketed character list 854 == static void p_bracket(struct parse *p); 855 */ 856 static void 857 p_bracket(struct parse *p) 858 { 859 cset *cs; 860 wint_t ch; 861 862 /* Dept of Truly Sickening Special-Case Kludges */ 863 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 864 EMIT(OBOW, 0); 865 NEXTn(6); 866 return; 867 } 868 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 869 EMIT(OEOW, 0); 870 NEXTn(6); 871 return; 872 } 873 874 if ((cs = allocset(p)) == NULL) 875 return; 876 877 if (p->g->cflags®_ICASE) 878 cs->icase = 1; 879 if (EAT('^')) 880 cs->invert = 1; 881 if (EAT(']')) 882 CHadd(p, cs, ']'); 883 else if (EAT('-')) 884 CHadd(p, cs, '-'); 885 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 886 p_b_term(p, cs); 887 if (EAT('-')) 888 CHadd(p, cs, '-'); 889 (void)MUSTEAT(']', REG_EBRACK); 890 891 if (p->error != 0) /* don't mess things up further */ 892 return; 893 894 if (cs->invert && p->g->cflags®_NEWLINE) 895 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 896 897 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 898 ordinary(p, ch); 899 freeset(p, cs); 900 } else 901 EMIT(OANYOF, (int)(cs - p->g->sets)); 902 } 903 904 /* 905 - p_b_term - parse one term of a bracketed character list 906 == static void p_b_term(struct parse *p, cset *cs); 907 */ 908 static void 909 p_b_term(struct parse *p, cset *cs) 910 { 911 char c; 912 wint_t start, finish; 913 wint_t i; 914 struct xlocale_collate *table = 915 (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE]; 916 917 /* classify what we've got */ 918 switch ((MORE()) ? PEEK() : '\0') { 919 case '[': 920 c = (MORE2()) ? PEEK2() : '\0'; 921 break; 922 case '-': 923 SETERROR(REG_ERANGE); 924 return; /* NOTE RETURN */ 925 default: 926 c = '\0'; 927 break; 928 } 929 930 switch (c) { 931 case ':': /* character class */ 932 NEXT2(); 933 (void)REQUIRE(MORE(), REG_EBRACK); 934 c = PEEK(); 935 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 936 p_b_cclass(p, cs); 937 (void)REQUIRE(MORE(), REG_EBRACK); 938 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 939 break; 940 case '=': /* equivalence class */ 941 NEXT2(); 942 (void)REQUIRE(MORE(), REG_EBRACK); 943 c = PEEK(); 944 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 945 p_b_eclass(p, cs); 946 (void)REQUIRE(MORE(), REG_EBRACK); 947 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 948 break; 949 default: /* symbol, ordinary character, or range */ 950 start = p_b_symbol(p); 951 if (SEE('-') && MORE2() && PEEK2() != ']') { 952 /* range */ 953 NEXT(); 954 if (EAT('-')) 955 finish = '-'; 956 else 957 finish = p_b_symbol(p); 958 } else 959 finish = start; 960 if (start == finish) 961 CHadd(p, cs, start); 962 else { 963 if (table->__collate_load_error || MB_CUR_MAX > 1) { 964 (void)REQUIRE(start <= finish, REG_ERANGE); 965 CHaddrange(p, cs, start, finish); 966 } else { 967 (void)REQUIRE(__wcollate_range_cmp(start, finish) <= 0, REG_ERANGE); 968 for (i = 0; i <= UCHAR_MAX; i++) { 969 if ( __wcollate_range_cmp(start, i) <= 0 970 && __wcollate_range_cmp(i, finish) <= 0 971 ) 972 CHadd(p, cs, i); 973 } 974 } 975 } 976 break; 977 } 978 } 979 980 /* 981 - p_b_cclass - parse a character-class name and deal with it 982 == static void p_b_cclass(struct parse *p, cset *cs); 983 */ 984 static void 985 p_b_cclass(struct parse *p, cset *cs) 986 { 987 const char *sp = p->next; 988 size_t len; 989 wctype_t wct; 990 char clname[16]; 991 992 while (MORE() && isalpha((uch)PEEK())) 993 NEXT(); 994 len = p->next - sp; 995 if (len >= sizeof(clname) - 1) { 996 SETERROR(REG_ECTYPE); 997 return; 998 } 999 memcpy(clname, sp, len); 1000 clname[len] = '\0'; 1001 if ((wct = wctype(clname)) == 0) { 1002 SETERROR(REG_ECTYPE); 1003 return; 1004 } 1005 CHaddtype(p, cs, wct); 1006 } 1007 1008 /* 1009 - p_b_eclass - parse an equivalence-class name and deal with it 1010 == static void p_b_eclass(struct parse *p, cset *cs); 1011 * 1012 * This implementation is incomplete. xxx 1013 */ 1014 static void 1015 p_b_eclass(struct parse *p, cset *cs) 1016 { 1017 wint_t c; 1018 1019 c = p_b_coll_elem(p, '='); 1020 CHadd(p, cs, c); 1021 } 1022 1023 /* 1024 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 1025 == static wint_t p_b_symbol(struct parse *p); 1026 */ 1027 static wint_t /* value of symbol */ 1028 p_b_symbol(struct parse *p) 1029 { 1030 wint_t value; 1031 1032 (void)REQUIRE(MORE(), REG_EBRACK); 1033 if (!EATTWO('[', '.')) 1034 return(WGETNEXT()); 1035 1036 /* collating symbol */ 1037 value = p_b_coll_elem(p, '.'); 1038 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 1039 return(value); 1040 } 1041 1042 /* 1043 - p_b_coll_elem - parse a collating-element name and look it up 1044 == static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 1045 */ 1046 static wint_t /* value of collating element */ 1047 p_b_coll_elem(struct parse *p, 1048 wint_t endc) /* name ended by endc,']' */ 1049 { 1050 const char *sp = p->next; 1051 struct cname *cp; 1052 mbstate_t mbs; 1053 wchar_t wc; 1054 size_t clen, len; 1055 1056 while (MORE() && !SEETWO(endc, ']')) 1057 NEXT(); 1058 if (!MORE()) { 1059 SETERROR(REG_EBRACK); 1060 return(0); 1061 } 1062 len = p->next - sp; 1063 for (cp = cnames; cp->name != NULL; cp++) 1064 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) 1065 return(cp->code); /* known name */ 1066 memset(&mbs, 0, sizeof(mbs)); 1067 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 1068 return (wc); /* single character */ 1069 else if (clen == (size_t)-1 || clen == (size_t)-2) 1070 SETERROR(REG_ILLSEQ); 1071 else 1072 SETERROR(REG_ECOLLATE); /* neither */ 1073 return(0); 1074 } 1075 1076 /* 1077 - othercase - return the case counterpart of an alphabetic 1078 == static wint_t othercase(wint_t ch); 1079 */ 1080 static wint_t /* if no counterpart, return ch */ 1081 othercase(wint_t ch) 1082 { 1083 assert(iswalpha(ch)); 1084 if (iswupper(ch)) 1085 return(towlower(ch)); 1086 else if (iswlower(ch)) 1087 return(towupper(ch)); 1088 else /* peculiar, but could happen */ 1089 return(ch); 1090 } 1091 1092 /* 1093 - bothcases - emit a dualcase version of a two-case character 1094 == static void bothcases(struct parse *p, wint_t ch); 1095 * 1096 * Boy, is this implementation ever a kludge... 1097 */ 1098 static void 1099 bothcases(struct parse *p, wint_t ch) 1100 { 1101 const char *oldnext = p->next; 1102 const char *oldend = p->end; 1103 char bracket[3 + MB_LEN_MAX]; 1104 size_t n; 1105 mbstate_t mbs; 1106 1107 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1108 p->next = bracket; 1109 memset(&mbs, 0, sizeof(mbs)); 1110 n = wcrtomb(bracket, ch, &mbs); 1111 assert(n != (size_t)-1); 1112 bracket[n] = ']'; 1113 bracket[n + 1] = '\0'; 1114 p->end = bracket+n+1; 1115 p_bracket(p); 1116 assert(p->next == p->end); 1117 p->next = oldnext; 1118 p->end = oldend; 1119 } 1120 1121 /* 1122 - ordinary - emit an ordinary character 1123 == static void ordinary(struct parse *p, wint_t ch); 1124 */ 1125 static void 1126 ordinary(struct parse *p, wint_t ch) 1127 { 1128 cset *cs; 1129 1130 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 1131 bothcases(p, ch); 1132 else if ((ch & OPDMASK) == ch) 1133 EMIT(OCHAR, ch); 1134 else { 1135 /* 1136 * Kludge: character is too big to fit into an OCHAR operand. 1137 * Emit a singleton set. 1138 */ 1139 if ((cs = allocset(p)) == NULL) 1140 return; 1141 CHadd(p, cs, ch); 1142 EMIT(OANYOF, (int)(cs - p->g->sets)); 1143 } 1144 } 1145 1146 /* 1147 - nonnewline - emit REG_NEWLINE version of OANY 1148 == static void nonnewline(struct parse *p); 1149 * 1150 * Boy, is this implementation ever a kludge... 1151 */ 1152 static void 1153 nonnewline(struct parse *p) 1154 { 1155 const char *oldnext = p->next; 1156 const char *oldend = p->end; 1157 char bracket[4]; 1158 1159 p->next = bracket; 1160 p->end = bracket+3; 1161 bracket[0] = '^'; 1162 bracket[1] = '\n'; 1163 bracket[2] = ']'; 1164 bracket[3] = '\0'; 1165 p_bracket(p); 1166 assert(p->next == bracket+3); 1167 p->next = oldnext; 1168 p->end = oldend; 1169 } 1170 1171 /* 1172 - repeat - generate code for a bounded repetition, recursively if needed 1173 == static void repeat(struct parse *p, sopno start, int from, int to); 1174 */ 1175 static void 1176 repeat(struct parse *p, 1177 sopno start, /* operand from here to end of strip */ 1178 int from, /* repeated from this number */ 1179 int to) /* to this number of times (maybe INFINITY) */ 1180 { 1181 sopno finish = HERE(); 1182 # define N 2 1183 # define INF 3 1184 # define REP(f, t) ((f)*8 + (t)) 1185 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1186 sopno copy; 1187 1188 if (p->error != 0) /* head off possible runaway recursion */ 1189 return; 1190 1191 assert(from <= to); 1192 1193 switch (REP(MAP(from), MAP(to))) { 1194 case REP(0, 0): /* must be user doing this */ 1195 DROP(finish-start); /* drop the operand */ 1196 break; 1197 case REP(0, 1): /* as x{1,1}? */ 1198 case REP(0, N): /* as x{1,n}? */ 1199 case REP(0, INF): /* as x{1,}? */ 1200 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1201 INSERT(OCH_, start); /* offset is wrong... */ 1202 repeat(p, start+1, 1, to); 1203 ASTERN(OOR1, start); 1204 AHEAD(start); /* ... fix it */ 1205 EMIT(OOR2, 0); 1206 AHEAD(THERE()); 1207 ASTERN(O_CH, THERETHERE()); 1208 break; 1209 case REP(1, 1): /* trivial case */ 1210 /* done */ 1211 break; 1212 case REP(1, N): /* as x?x{1,n-1} */ 1213 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1214 INSERT(OCH_, start); 1215 ASTERN(OOR1, start); 1216 AHEAD(start); 1217 EMIT(OOR2, 0); /* offset very wrong... */ 1218 AHEAD(THERE()); /* ...so fix it */ 1219 ASTERN(O_CH, THERETHERE()); 1220 copy = dupl(p, start+1, finish+1); 1221 assert(copy == finish+4); 1222 repeat(p, copy, 1, to-1); 1223 break; 1224 case REP(1, INF): /* as x+ */ 1225 INSERT(OPLUS_, start); 1226 ASTERN(O_PLUS, start); 1227 break; 1228 case REP(N, N): /* as xx{m-1,n-1} */ 1229 copy = dupl(p, start, finish); 1230 repeat(p, copy, from-1, to-1); 1231 break; 1232 case REP(N, INF): /* as xx{n-1,INF} */ 1233 copy = dupl(p, start, finish); 1234 repeat(p, copy, from-1, to); 1235 break; 1236 default: /* "can't happen" */ 1237 SETERROR(REG_ASSERT); /* just in case */ 1238 break; 1239 } 1240 } 1241 1242 /* 1243 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1244 - character from the parse struct, signals a REG_ILLSEQ error if the 1245 - character can't be converted. Returns the number of bytes consumed. 1246 */ 1247 static wint_t 1248 wgetnext(struct parse *p) 1249 { 1250 mbstate_t mbs; 1251 wchar_t wc; 1252 size_t n; 1253 1254 memset(&mbs, 0, sizeof(mbs)); 1255 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); 1256 if (n == (size_t)-1 || n == (size_t)-2) { 1257 SETERROR(REG_ILLSEQ); 1258 return (0); 1259 } 1260 if (n == 0) 1261 n = 1; 1262 p->next += n; 1263 return (wc); 1264 } 1265 1266 /* 1267 - seterr - set an error condition 1268 == static int seterr(struct parse *p, int e); 1269 */ 1270 static int /* useless but makes type checking happy */ 1271 seterr(struct parse *p, int e) 1272 { 1273 if (p->error == 0) /* keep earliest error condition */ 1274 p->error = e; 1275 p->next = nuls; /* try to bring things to a halt */ 1276 p->end = nuls; 1277 return(0); /* make the return value well-defined */ 1278 } 1279 1280 /* 1281 - allocset - allocate a set of characters for [] 1282 == static cset *allocset(struct parse *p); 1283 */ 1284 static cset * 1285 allocset(struct parse *p) 1286 { 1287 cset *cs, *ncs; 1288 1289 ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs)); 1290 if (ncs == NULL) { 1291 SETERROR(REG_ESPACE); 1292 return (NULL); 1293 } 1294 p->g->sets = ncs; 1295 cs = &p->g->sets[p->g->ncsets++]; 1296 memset(cs, 0, sizeof(*cs)); 1297 1298 return(cs); 1299 } 1300 1301 /* 1302 - freeset - free a now-unused set 1303 == static void freeset(struct parse *p, cset *cs); 1304 */ 1305 static void 1306 freeset(struct parse *p, cset *cs) 1307 { 1308 cset *top = &p->g->sets[p->g->ncsets]; 1309 1310 free(cs->wides); 1311 free(cs->ranges); 1312 free(cs->types); 1313 memset(cs, 0, sizeof(*cs)); 1314 if (cs == top-1) /* recover only the easy case */ 1315 p->g->ncsets--; 1316 } 1317 1318 /* 1319 - singleton - Determine whether a set contains only one character, 1320 - returning it if so, otherwise returning OUT. 1321 */ 1322 static wint_t 1323 singleton(cset *cs) 1324 { 1325 wint_t i, s, n; 1326 1327 for (i = n = 0; i < NC; i++) 1328 if (CHIN(cs, i)) { 1329 n++; 1330 s = i; 1331 } 1332 if (n == 1) 1333 return (s); 1334 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1335 cs->icase == 0) 1336 return (cs->wides[0]); 1337 /* Don't bother handling the other cases. */ 1338 return (OUT); 1339 } 1340 1341 /* 1342 - CHadd - add character to character set. 1343 */ 1344 static void 1345 CHadd(struct parse *p, cset *cs, wint_t ch) 1346 { 1347 wint_t nch, *newwides; 1348 assert(ch >= 0); 1349 if (ch < NC) 1350 cs->bmp[ch >> 3] |= 1 << (ch & 7); 1351 else { 1352 newwides = reallocarray(cs->wides, cs->nwides + 1, 1353 sizeof(*cs->wides)); 1354 if (newwides == NULL) { 1355 SETERROR(REG_ESPACE); 1356 return; 1357 } 1358 cs->wides = newwides; 1359 cs->wides[cs->nwides++] = ch; 1360 } 1361 if (cs->icase) { 1362 if ((nch = towlower(ch)) < NC) 1363 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1364 if ((nch = towupper(ch)) < NC) 1365 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1366 } 1367 } 1368 1369 /* 1370 - CHaddrange - add all characters in the range [min,max] to a character set. 1371 */ 1372 static void 1373 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 1374 { 1375 crange *newranges; 1376 1377 for (; min < NC && min <= max; min++) 1378 CHadd(p, cs, min); 1379 if (min >= max) 1380 return; 1381 newranges = reallocarray(cs->ranges, cs->nranges + 1, 1382 sizeof(*cs->ranges)); 1383 if (newranges == NULL) { 1384 SETERROR(REG_ESPACE); 1385 return; 1386 } 1387 cs->ranges = newranges; 1388 cs->ranges[cs->nranges].min = min; 1389 cs->ranges[cs->nranges].max = max; 1390 cs->nranges++; 1391 } 1392 1393 /* 1394 - CHaddtype - add all characters of a certain type to a character set. 1395 */ 1396 static void 1397 CHaddtype(struct parse *p, cset *cs, wctype_t wct) 1398 { 1399 wint_t i; 1400 wctype_t *newtypes; 1401 1402 for (i = 0; i < NC; i++) 1403 if (iswctype(i, wct)) 1404 CHadd(p, cs, i); 1405 newtypes = reallocarray(cs->types, cs->ntypes + 1, 1406 sizeof(*cs->types)); 1407 if (newtypes == NULL) { 1408 SETERROR(REG_ESPACE); 1409 return; 1410 } 1411 cs->types = newtypes; 1412 cs->types[cs->ntypes++] = wct; 1413 } 1414 1415 /* 1416 - dupl - emit a duplicate of a bunch of sops 1417 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1418 */ 1419 static sopno /* start of duplicate */ 1420 dupl(struct parse *p, 1421 sopno start, /* from here */ 1422 sopno finish) /* to this less one */ 1423 { 1424 sopno ret = HERE(); 1425 sopno len = finish - start; 1426 1427 assert(finish >= start); 1428 if (len == 0) 1429 return(ret); 1430 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1431 return(ret); 1432 (void) memcpy((char *)(p->strip + p->slen), 1433 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1434 p->slen += len; 1435 return(ret); 1436 } 1437 1438 /* 1439 - doemit - emit a strip operator 1440 == static void doemit(struct parse *p, sop op, size_t opnd); 1441 * 1442 * It might seem better to implement this as a macro with a function as 1443 * hard-case backup, but it's just too big and messy unless there are 1444 * some changes to the data structures. Maybe later. 1445 */ 1446 static void 1447 doemit(struct parse *p, sop op, size_t opnd) 1448 { 1449 /* avoid making error situations worse */ 1450 if (p->error != 0) 1451 return; 1452 1453 /* deal with oversize operands ("can't happen", more or less) */ 1454 assert(opnd < 1<<OPSHIFT); 1455 1456 /* deal with undersized strip */ 1457 if (p->slen >= p->ssize) 1458 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1459 return; 1460 1461 /* finally, it's all reduced to the easy case */ 1462 p->strip[p->slen++] = SOP(op, opnd); 1463 } 1464 1465 /* 1466 - doinsert - insert a sop into the strip 1467 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1468 */ 1469 static void 1470 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1471 { 1472 sopno sn; 1473 sop s; 1474 int i; 1475 1476 /* avoid making error situations worse */ 1477 if (p->error != 0) 1478 return; 1479 1480 sn = HERE(); 1481 EMIT(op, opnd); /* do checks, ensure space */ 1482 assert(HERE() == sn+1); 1483 s = p->strip[sn]; 1484 1485 /* adjust paren pointers */ 1486 assert(pos > 0); 1487 for (i = 1; i < NPAREN; i++) { 1488 if (p->pbegin[i] >= pos) { 1489 p->pbegin[i]++; 1490 } 1491 if (p->pend[i] >= pos) { 1492 p->pend[i]++; 1493 } 1494 } 1495 1496 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1497 (HERE()-pos-1)*sizeof(sop)); 1498 p->strip[pos] = s; 1499 } 1500 1501 /* 1502 - dofwd - complete a forward reference 1503 == static void dofwd(struct parse *p, sopno pos, sop value); 1504 */ 1505 static void 1506 dofwd(struct parse *p, sopno pos, sop value) 1507 { 1508 /* avoid making error situations worse */ 1509 if (p->error != 0) 1510 return; 1511 1512 assert(value < 1<<OPSHIFT); 1513 p->strip[pos] = OP(p->strip[pos]) | value; 1514 } 1515 1516 /* 1517 - enlarge - enlarge the strip 1518 == static int enlarge(struct parse *p, sopno size); 1519 */ 1520 static int 1521 enlarge(struct parse *p, sopno size) 1522 { 1523 sop *sp; 1524 1525 if (p->ssize >= size) 1526 return 1; 1527 1528 sp = reallocarray(p->strip, size, sizeof(sop)); 1529 if (sp == NULL) { 1530 SETERROR(REG_ESPACE); 1531 return 0; 1532 } 1533 p->strip = sp; 1534 p->ssize = size; 1535 return 1; 1536 } 1537 1538 /* 1539 - stripsnug - compact the strip 1540 == static void stripsnug(struct parse *p, struct re_guts *g); 1541 */ 1542 static void 1543 stripsnug(struct parse *p, struct re_guts *g) 1544 { 1545 g->nstates = p->slen; 1546 g->strip = reallocarray((char *)p->strip, p->slen, sizeof(sop)); 1547 if (g->strip == NULL) { 1548 SETERROR(REG_ESPACE); 1549 g->strip = p->strip; 1550 } 1551 } 1552 1553 /* 1554 - findmust - fill in must and mlen with longest mandatory literal string 1555 == static void findmust(struct parse *p, struct re_guts *g); 1556 * 1557 * This algorithm could do fancy things like analyzing the operands of | 1558 * for common subsequences. Someday. This code is simple and finds most 1559 * of the interesting cases. 1560 * 1561 * Note that must and mlen got initialized during setup. 1562 */ 1563 static void 1564 findmust(struct parse *p, struct re_guts *g) 1565 { 1566 sop *scan; 1567 sop *start = NULL; 1568 sop *newstart = NULL; 1569 sopno newlen; 1570 sop s; 1571 char *cp; 1572 int offset; 1573 char buf[MB_LEN_MAX]; 1574 size_t clen; 1575 mbstate_t mbs; 1576 1577 /* avoid making error situations worse */ 1578 if (p->error != 0) 1579 return; 1580 1581 /* 1582 * It's not generally safe to do a ``char'' substring search on 1583 * multibyte character strings, but it's safe for at least 1584 * UTF-8 (see RFC 3629). 1585 */ 1586 if (MB_CUR_MAX > 1 && 1587 strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) 1588 return; 1589 1590 /* find the longest OCHAR sequence in strip */ 1591 newlen = 0; 1592 offset = 0; 1593 g->moffset = 0; 1594 scan = g->strip + 1; 1595 do { 1596 s = *scan++; 1597 switch (OP(s)) { 1598 case OCHAR: /* sequence member */ 1599 if (newlen == 0) { /* new sequence */ 1600 memset(&mbs, 0, sizeof(mbs)); 1601 newstart = scan - 1; 1602 } 1603 clen = wcrtomb(buf, OPND(s), &mbs); 1604 if (clen == (size_t)-1) 1605 goto toohard; 1606 newlen += clen; 1607 break; 1608 case OPLUS_: /* things that don't break one */ 1609 case OLPAREN: 1610 case ORPAREN: 1611 break; 1612 case OQUEST_: /* things that must be skipped */ 1613 case OCH_: 1614 offset = altoffset(scan, offset); 1615 scan--; 1616 do { 1617 scan += OPND(s); 1618 s = *scan; 1619 /* assert() interferes w debug printouts */ 1620 if (OP(s) != O_QUEST && OP(s) != O_CH && 1621 OP(s) != OOR2) { 1622 g->iflags |= BAD; 1623 return; 1624 } 1625 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1626 /* FALLTHROUGH */ 1627 case OBOW: /* things that break a sequence */ 1628 case OEOW: 1629 case OBOL: 1630 case OEOL: 1631 case O_QUEST: 1632 case O_CH: 1633 case OEND: 1634 if (newlen > g->mlen) { /* ends one */ 1635 start = newstart; 1636 g->mlen = newlen; 1637 if (offset > -1) { 1638 g->moffset += offset; 1639 offset = newlen; 1640 } else 1641 g->moffset = offset; 1642 } else { 1643 if (offset > -1) 1644 offset += newlen; 1645 } 1646 newlen = 0; 1647 break; 1648 case OANY: 1649 if (newlen > g->mlen) { /* ends one */ 1650 start = newstart; 1651 g->mlen = newlen; 1652 if (offset > -1) { 1653 g->moffset += offset; 1654 offset = newlen; 1655 } else 1656 g->moffset = offset; 1657 } else { 1658 if (offset > -1) 1659 offset += newlen; 1660 } 1661 if (offset > -1) 1662 offset++; 1663 newlen = 0; 1664 break; 1665 case OANYOF: /* may or may not invalidate offset */ 1666 /* First, everything as OANY */ 1667 if (newlen > g->mlen) { /* ends one */ 1668 start = newstart; 1669 g->mlen = newlen; 1670 if (offset > -1) { 1671 g->moffset += offset; 1672 offset = newlen; 1673 } else 1674 g->moffset = offset; 1675 } else { 1676 if (offset > -1) 1677 offset += newlen; 1678 } 1679 if (offset > -1) 1680 offset++; 1681 newlen = 0; 1682 break; 1683 toohard: 1684 default: 1685 /* Anything here makes it impossible or too hard 1686 * to calculate the offset -- so we give up; 1687 * save the last known good offset, in case the 1688 * must sequence doesn't occur later. 1689 */ 1690 if (newlen > g->mlen) { /* ends one */ 1691 start = newstart; 1692 g->mlen = newlen; 1693 if (offset > -1) 1694 g->moffset += offset; 1695 else 1696 g->moffset = offset; 1697 } 1698 offset = -1; 1699 newlen = 0; 1700 break; 1701 } 1702 } while (OP(s) != OEND); 1703 1704 if (g->mlen == 0) { /* there isn't one */ 1705 g->moffset = -1; 1706 return; 1707 } 1708 1709 /* turn it into a character string */ 1710 g->must = malloc((size_t)g->mlen + 1); 1711 if (g->must == NULL) { /* argh; just forget it */ 1712 g->mlen = 0; 1713 g->moffset = -1; 1714 return; 1715 } 1716 cp = g->must; 1717 scan = start; 1718 memset(&mbs, 0, sizeof(mbs)); 1719 while (cp < g->must + g->mlen) { 1720 while (OP(s = *scan++) != OCHAR) 1721 continue; 1722 clen = wcrtomb(cp, OPND(s), &mbs); 1723 assert(clen != (size_t)-1); 1724 cp += clen; 1725 } 1726 assert(cp == g->must + g->mlen); 1727 *cp++ = '\0'; /* just on general principles */ 1728 } 1729 1730 /* 1731 - altoffset - choose biggest offset among multiple choices 1732 == static int altoffset(sop *scan, int offset); 1733 * 1734 * Compute, recursively if necessary, the largest offset among multiple 1735 * re paths. 1736 */ 1737 static int 1738 altoffset(sop *scan, int offset) 1739 { 1740 int largest; 1741 int try; 1742 sop s; 1743 1744 /* If we gave up already on offsets, return */ 1745 if (offset == -1) 1746 return -1; 1747 1748 largest = 0; 1749 try = 0; 1750 s = *scan++; 1751 while (OP(s) != O_QUEST && OP(s) != O_CH) { 1752 switch (OP(s)) { 1753 case OOR1: 1754 if (try > largest) 1755 largest = try; 1756 try = 0; 1757 break; 1758 case OQUEST_: 1759 case OCH_: 1760 try = altoffset(scan, try); 1761 if (try == -1) 1762 return -1; 1763 scan--; 1764 do { 1765 scan += OPND(s); 1766 s = *scan; 1767 if (OP(s) != O_QUEST && OP(s) != O_CH && 1768 OP(s) != OOR2) 1769 return -1; 1770 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1771 /* We must skip to the next position, or we'll 1772 * leave altoffset() too early. 1773 */ 1774 scan++; 1775 break; 1776 case OANYOF: 1777 case OCHAR: 1778 case OANY: 1779 try++; 1780 case OBOW: 1781 case OEOW: 1782 case OLPAREN: 1783 case ORPAREN: 1784 case OOR2: 1785 break; 1786 default: 1787 try = -1; 1788 break; 1789 } 1790 if (try == -1) 1791 return -1; 1792 s = *scan++; 1793 } 1794 1795 if (try > largest) 1796 largest = try; 1797 1798 return largest+offset; 1799 } 1800 1801 /* 1802 - computejumps - compute char jumps for BM scan 1803 == static void computejumps(struct parse *p, struct re_guts *g); 1804 * 1805 * This algorithm assumes g->must exists and is has size greater than 1806 * zero. It's based on the algorithm found on Computer Algorithms by 1807 * Sara Baase. 1808 * 1809 * A char jump is the number of characters one needs to jump based on 1810 * the value of the character from the text that was mismatched. 1811 */ 1812 static void 1813 computejumps(struct parse *p, struct re_guts *g) 1814 { 1815 int ch; 1816 int mindex; 1817 1818 /* Avoid making errors worse */ 1819 if (p->error != 0) 1820 return; 1821 1822 g->charjump = (int*) malloc((NC + 1) * sizeof(int)); 1823 if (g->charjump == NULL) /* Not a fatal error */ 1824 return; 1825 /* Adjust for signed chars, if necessary */ 1826 g->charjump = &g->charjump[-(CHAR_MIN)]; 1827 1828 /* If the character does not exist in the pattern, the jump 1829 * is equal to the number of characters in the pattern. 1830 */ 1831 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 1832 g->charjump[ch] = g->mlen; 1833 1834 /* If the character does exist, compute the jump that would 1835 * take us to the last character in the pattern equal to it 1836 * (notice that we match right to left, so that last character 1837 * is the first one that would be matched). 1838 */ 1839 for (mindex = 0; mindex < g->mlen; mindex++) 1840 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 1841 } 1842 1843 /* 1844 - computematchjumps - compute match jumps for BM scan 1845 == static void computematchjumps(struct parse *p, struct re_guts *g); 1846 * 1847 * This algorithm assumes g->must exists and is has size greater than 1848 * zero. It's based on the algorithm found on Computer Algorithms by 1849 * Sara Baase. 1850 * 1851 * A match jump is the number of characters one needs to advance based 1852 * on the already-matched suffix. 1853 * Notice that all values here are minus (g->mlen-1), because of the way 1854 * the search algorithm works. 1855 */ 1856 static void 1857 computematchjumps(struct parse *p, struct re_guts *g) 1858 { 1859 int mindex; /* General "must" iterator */ 1860 int suffix; /* Keeps track of matching suffix */ 1861 int ssuffix; /* Keeps track of suffixes' suffix */ 1862 int* pmatches; /* pmatches[k] points to the next i 1863 * such that i+1...mlen is a substring 1864 * of k+1...k+mlen-i-1 1865 */ 1866 1867 /* Avoid making errors worse */ 1868 if (p->error != 0) 1869 return; 1870 1871 pmatches = (int*) malloc(g->mlen * sizeof(int)); 1872 if (pmatches == NULL) { 1873 g->matchjump = NULL; 1874 return; 1875 } 1876 1877 g->matchjump = (int*) malloc(g->mlen * sizeof(int)); 1878 if (g->matchjump == NULL) { /* Not a fatal error */ 1879 free(pmatches); 1880 return; 1881 } 1882 1883 /* Set maximum possible jump for each character in the pattern */ 1884 for (mindex = 0; mindex < g->mlen; mindex++) 1885 g->matchjump[mindex] = 2*g->mlen - mindex - 1; 1886 1887 /* Compute pmatches[] */ 1888 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 1889 mindex--, suffix--) { 1890 pmatches[mindex] = suffix; 1891 1892 /* If a mismatch is found, interrupting the substring, 1893 * compute the matchjump for that position. If no 1894 * mismatch is found, then a text substring mismatched 1895 * against the suffix will also mismatch against the 1896 * substring. 1897 */ 1898 while (suffix < g->mlen 1899 && g->must[mindex] != g->must[suffix]) { 1900 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1901 g->mlen - mindex - 1); 1902 suffix = pmatches[suffix]; 1903 } 1904 } 1905 1906 /* Compute the matchjump up to the last substring found to jump 1907 * to the beginning of the largest must pattern prefix matching 1908 * it's own suffix. 1909 */ 1910 for (mindex = 0; mindex <= suffix; mindex++) 1911 g->matchjump[mindex] = MIN(g->matchjump[mindex], 1912 g->mlen + suffix - mindex); 1913 1914 ssuffix = pmatches[suffix]; 1915 while (suffix < g->mlen) { 1916 while (suffix <= ssuffix && suffix < g->mlen) { 1917 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1918 g->mlen + ssuffix - suffix); 1919 suffix++; 1920 } 1921 if (suffix < g->mlen) 1922 ssuffix = pmatches[ssuffix]; 1923 } 1924 1925 free(pmatches); 1926 } 1927 1928 /* 1929 - pluscount - count + nesting 1930 == static sopno pluscount(struct parse *p, struct re_guts *g); 1931 */ 1932 static sopno /* nesting depth */ 1933 pluscount(struct parse *p, struct re_guts *g) 1934 { 1935 sop *scan; 1936 sop s; 1937 sopno plusnest = 0; 1938 sopno maxnest = 0; 1939 1940 if (p->error != 0) 1941 return(0); /* there may not be an OEND */ 1942 1943 scan = g->strip + 1; 1944 do { 1945 s = *scan++; 1946 switch (OP(s)) { 1947 case OPLUS_: 1948 plusnest++; 1949 break; 1950 case O_PLUS: 1951 if (plusnest > maxnest) 1952 maxnest = plusnest; 1953 plusnest--; 1954 break; 1955 } 1956 } while (OP(s) != OEND); 1957 if (plusnest != 0) 1958 g->iflags |= BAD; 1959 return(maxnest); 1960 } 1961