1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * Copyright (c) 2011 The FreeBSD Foundation 7 * All rights reserved. 8 * Portions of this software were developed by David Chisnall 9 * under sponsorship from the FreeBSD Foundation. 10 * 11 * This code is derived from software contributed to Berkeley by 12 * Henry Spencer. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 39 */ 40 41 #if defined(LIBC_SCCS) && !defined(lint) 42 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 43 #endif /* LIBC_SCCS and not lint */ 44 #include <sys/cdefs.h> 45 __FBSDID("$FreeBSD$"); 46 47 #include <sys/types.h> 48 #include <stdio.h> 49 #include <string.h> 50 #include <ctype.h> 51 #include <limits.h> 52 #include <stdlib.h> 53 #include <regex.h> 54 #include <stdbool.h> 55 #include <wchar.h> 56 #include <wctype.h> 57 58 #include "collate.h" 59 60 #include "utils.h" 61 #include "regex2.h" 62 63 #include "cname.h" 64 65 /* 66 * Branching context, used to keep track of branch state for all of the branch- 67 * aware functions. In addition to keeping track of branch positions for the 68 * p_branch_* functions, we use this to simplify some clumsiness in BREs for 69 * detection of whether ^ is acting as an anchor or being used erroneously and 70 * also for whether we're in a sub-expression or not. 71 */ 72 struct branchc { 73 sopno start; 74 sopno back; 75 sopno fwd; 76 77 int nbranch; 78 int nchain; 79 bool outer; 80 bool terminate; 81 }; 82 83 /* 84 * parse structure, passed up and down to avoid global variables and 85 * other clumsinesses 86 */ 87 struct parse { 88 const char *next; /* next character in RE */ 89 const char *end; /* end of string (-> NUL normally) */ 90 int error; /* has an error been seen? */ 91 sop *strip; /* malloced strip */ 92 sopno ssize; /* malloced strip size (allocated) */ 93 sopno slen; /* malloced strip length (used) */ 94 int ncsalloc; /* number of csets allocated */ 95 struct re_guts *g; 96 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 97 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 98 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 99 bool allowbranch; /* can this expression branch? */ 100 bool bre; /* convenience; is this a BRE? */ 101 bool (*parse_expr)(struct parse *, struct branchc *); 102 void (*pre_parse)(struct parse *, struct branchc *); 103 void (*post_parse)(struct parse *, struct branchc *); 104 }; 105 106 /* ========= begin header generated by ./mkh ========= */ 107 #ifdef __cplusplus 108 extern "C" { 109 #endif 110 111 /* === regcomp.c === */ 112 static bool p_ere_exp(struct parse *p, struct branchc *bc); 113 static void p_str(struct parse *p); 114 static int p_branch_eat_delim(struct parse *p, struct branchc *bc); 115 static void p_branch_ins_offset(struct parse *p, struct branchc *bc); 116 static void p_branch_fix_tail(struct parse *p, struct branchc *bc); 117 static bool p_branch_empty(struct parse *p, struct branchc *bc); 118 static bool p_branch_do(struct parse *p, struct branchc *bc); 119 static void p_bre_pre_parse(struct parse *p, struct branchc *bc); 120 static void p_bre_post_parse(struct parse *p, struct branchc *bc); 121 static void p_re(struct parse *p, int end1, int end2); 122 static bool p_simp_re(struct parse *p, struct branchc *bc); 123 static int p_count(struct parse *p); 124 static void p_bracket(struct parse *p); 125 static void p_b_term(struct parse *p, cset *cs); 126 static void p_b_cclass(struct parse *p, cset *cs); 127 static void p_b_eclass(struct parse *p, cset *cs); 128 static wint_t p_b_symbol(struct parse *p); 129 static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 130 static wint_t othercase(wint_t ch); 131 static void bothcases(struct parse *p, wint_t ch); 132 static void ordinary(struct parse *p, wint_t ch); 133 static void nonnewline(struct parse *p); 134 static void repeat(struct parse *p, sopno start, int from, int to); 135 static int seterr(struct parse *p, int e); 136 static cset *allocset(struct parse *p); 137 static void freeset(struct parse *p, cset *cs); 138 static void CHadd(struct parse *p, cset *cs, wint_t ch); 139 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 140 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 141 static wint_t singleton(cset *cs); 142 static sopno dupl(struct parse *p, sopno start, sopno finish); 143 static void doemit(struct parse *p, sop op, size_t opnd); 144 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 145 static void dofwd(struct parse *p, sopno pos, sop value); 146 static int enlarge(struct parse *p, sopno size); 147 static void stripsnug(struct parse *p, struct re_guts *g); 148 static void findmust(struct parse *p, struct re_guts *g); 149 static int altoffset(sop *scan, int offset); 150 static void computejumps(struct parse *p, struct re_guts *g); 151 static void computematchjumps(struct parse *p, struct re_guts *g); 152 static sopno pluscount(struct parse *p, struct re_guts *g); 153 static wint_t wgetnext(struct parse *p); 154 155 #ifdef __cplusplus 156 } 157 #endif 158 /* ========= end header generated by ./mkh ========= */ 159 160 static char nuls[10]; /* place to point scanner in event of error */ 161 162 /* 163 * macros for use with parse structure 164 * BEWARE: these know that the parse structure is named `p' !!! 165 */ 166 #define PEEK() (*p->next) 167 #define PEEK2() (*(p->next+1)) 168 #define MORE() (p->next < p->end) 169 #define MORE2() (p->next+1 < p->end) 170 #define SEE(c) (MORE() && PEEK() == (c)) 171 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 172 #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a)) 173 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 174 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 175 #define NEXT() (p->next++) 176 #define NEXT2() (p->next += 2) 177 #define NEXTn(n) (p->next += (n)) 178 #define GETNEXT() (*p->next++) 179 #define WGETNEXT() wgetnext(p) 180 #define SETERROR(e) seterr(p, (e)) 181 #define REQUIRE(co, e) ((co) || SETERROR(e)) 182 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 183 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 184 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 185 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 186 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 187 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 188 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 189 #define HERE() (p->slen) 190 #define THERE() (p->slen - 1) 191 #define THERETHERE() (p->slen - 2) 192 #define DROP(n) (p->slen -= (n)) 193 194 #ifndef NDEBUG 195 static int never = 0; /* for use in asserts; shuts lint up */ 196 #else 197 #define never 0 /* some <assert.h>s have bugs too */ 198 #endif 199 200 /* Macro used by computejump()/computematchjump() */ 201 #define MIN(a,b) ((a)<(b)?(a):(b)) 202 203 /* 204 - regcomp - interface for parser and compilation 205 = extern int regcomp(regex_t *, const char *, int); 206 = #define REG_BASIC 0000 207 = #define REG_EXTENDED 0001 208 = #define REG_ICASE 0002 209 = #define REG_NOSUB 0004 210 = #define REG_NEWLINE 0010 211 = #define REG_NOSPEC 0020 212 = #define REG_PEND 0040 213 = #define REG_DUMP 0200 214 */ 215 int /* 0 success, otherwise REG_something */ 216 regcomp(regex_t * __restrict preg, 217 const char * __restrict pattern, 218 int cflags) 219 { 220 struct parse pa; 221 struct re_guts *g; 222 struct parse *p = &pa; 223 int i; 224 size_t len; 225 size_t maxlen; 226 #ifdef REDEBUG 227 # define GOODFLAGS(f) (f) 228 #else 229 # define GOODFLAGS(f) ((f)&~REG_DUMP) 230 #endif 231 232 cflags = GOODFLAGS(cflags); 233 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 234 return(REG_INVARG); 235 236 if (cflags®_PEND) { 237 if (preg->re_endp < pattern) 238 return(REG_INVARG); 239 len = preg->re_endp - pattern; 240 } else 241 len = strlen(pattern); 242 243 /* do the mallocs early so failure handling is easy */ 244 g = (struct re_guts *)malloc(sizeof(struct re_guts)); 245 if (g == NULL) 246 return(REG_ESPACE); 247 /* 248 * Limit the pattern space to avoid a 32-bit overflow on buffer 249 * extension. Also avoid any signed overflow in case of conversion 250 * so make the real limit based on a 31-bit overflow. 251 * 252 * Likely not applicable on 64-bit systems but handle the case 253 * generically (who are we to stop people from using ~715MB+ 254 * patterns?). 255 */ 256 maxlen = ((size_t)-1 >> 1) / sizeof(sop) * 2 / 3; 257 if (len >= maxlen) { 258 free((char *)g); 259 return(REG_ESPACE); 260 } 261 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 262 assert(p->ssize >= len); 263 264 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 265 p->slen = 0; 266 if (p->strip == NULL) { 267 free((char *)g); 268 return(REG_ESPACE); 269 } 270 271 /* set things up */ 272 p->g = g; 273 p->next = pattern; /* convenience; we do not modify it */ 274 p->end = p->next + len; 275 p->error = 0; 276 p->ncsalloc = 0; 277 for (i = 0; i < NPAREN; i++) { 278 p->pbegin[i] = 0; 279 p->pend[i] = 0; 280 } 281 if (cflags & REG_EXTENDED) { 282 p->allowbranch = true; 283 p->bre = false; 284 p->parse_expr = p_ere_exp; 285 p->pre_parse = NULL; 286 p->post_parse = NULL; 287 } else { 288 p->allowbranch = false; 289 p->bre = true; 290 p->parse_expr = p_simp_re; 291 p->pre_parse = p_bre_pre_parse; 292 p->post_parse = p_bre_post_parse; 293 } 294 g->sets = NULL; 295 g->ncsets = 0; 296 g->cflags = cflags; 297 g->iflags = 0; 298 g->nbol = 0; 299 g->neol = 0; 300 g->must = NULL; 301 g->moffset = -1; 302 g->charjump = NULL; 303 g->matchjump = NULL; 304 g->mlen = 0; 305 g->nsub = 0; 306 g->backrefs = 0; 307 308 /* do it */ 309 EMIT(OEND, 0); 310 g->firststate = THERE(); 311 if (cflags & REG_NOSPEC) 312 p_str(p); 313 else 314 p_re(p, OUT, OUT); 315 EMIT(OEND, 0); 316 g->laststate = THERE(); 317 318 /* tidy up loose ends and fill things in */ 319 stripsnug(p, g); 320 findmust(p, g); 321 /* only use Boyer-Moore algorithm if the pattern is bigger 322 * than three characters 323 */ 324 if(g->mlen > 3) { 325 computejumps(p, g); 326 computematchjumps(p, g); 327 if(g->matchjump == NULL && g->charjump != NULL) { 328 free(g->charjump); 329 g->charjump = NULL; 330 } 331 } 332 g->nplus = pluscount(p, g); 333 g->magic = MAGIC2; 334 preg->re_nsub = g->nsub; 335 preg->re_g = g; 336 preg->re_magic = MAGIC1; 337 #ifndef REDEBUG 338 /* not debugging, so can't rely on the assert() in regexec() */ 339 if (g->iflags&BAD) 340 SETERROR(REG_ASSERT); 341 #endif 342 343 /* win or lose, we're done */ 344 if (p->error != 0) /* lose */ 345 regfree(preg); 346 return(p->error); 347 } 348 349 /* 350 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op, 351 - return whether we should terminate or not 352 == static bool p_ere_exp(struct parse *p); 353 */ 354 static bool 355 p_ere_exp(struct parse *p, struct branchc *bc) 356 { 357 char c; 358 wint_t wc; 359 sopno pos; 360 int count; 361 int count2; 362 sopno subno; 363 int wascaret = 0; 364 365 assert(MORE()); /* caller should have ensured this */ 366 c = GETNEXT(); 367 368 pos = HERE(); 369 switch (c) { 370 case '(': 371 (void)REQUIRE(MORE(), REG_EPAREN); 372 p->g->nsub++; 373 subno = p->g->nsub; 374 if (subno < NPAREN) 375 p->pbegin[subno] = HERE(); 376 EMIT(OLPAREN, subno); 377 if (!SEE(')')) 378 p_re(p, ')', IGN); 379 if (subno < NPAREN) { 380 p->pend[subno] = HERE(); 381 assert(p->pend[subno] != 0); 382 } 383 EMIT(ORPAREN, subno); 384 (void)MUSTEAT(')', REG_EPAREN); 385 break; 386 #ifndef POSIX_MISTAKE 387 case ')': /* happens only if no current unmatched ( */ 388 /* 389 * You may ask, why the ifndef? Because I didn't notice 390 * this until slightly too late for 1003.2, and none of the 391 * other 1003.2 regular-expression reviewers noticed it at 392 * all. So an unmatched ) is legal POSIX, at least until 393 * we can get it fixed. 394 */ 395 SETERROR(REG_EPAREN); 396 break; 397 #endif 398 case '^': 399 EMIT(OBOL, 0); 400 p->g->iflags |= USEBOL; 401 p->g->nbol++; 402 wascaret = 1; 403 break; 404 case '$': 405 EMIT(OEOL, 0); 406 p->g->iflags |= USEEOL; 407 p->g->neol++; 408 break; 409 case '|': 410 SETERROR(REG_EMPTY); 411 break; 412 case '*': 413 case '+': 414 case '?': 415 SETERROR(REG_BADRPT); 416 break; 417 case '.': 418 if (p->g->cflags®_NEWLINE) 419 nonnewline(p); 420 else 421 EMIT(OANY, 0); 422 break; 423 case '[': 424 p_bracket(p); 425 break; 426 case '\\': 427 (void)REQUIRE(MORE(), REG_EESCAPE); 428 wc = WGETNEXT(); 429 switch (wc) { 430 case '<': 431 EMIT(OBOW, 0); 432 break; 433 case '>': 434 EMIT(OEOW, 0); 435 break; 436 default: 437 ordinary(p, wc); 438 break; 439 } 440 break; 441 case '{': /* okay as ordinary except if digit follows */ 442 (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 443 /* FALLTHROUGH */ 444 default: 445 if (p->error != 0) 446 return (false); 447 p->next--; 448 wc = WGETNEXT(); 449 ordinary(p, wc); 450 break; 451 } 452 453 if (!MORE()) 454 return (false); 455 c = PEEK(); 456 /* we call { a repetition if followed by a digit */ 457 if (!( c == '*' || c == '+' || c == '?' || 458 (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) 459 return (false); /* no repetition, we're done */ 460 NEXT(); 461 462 (void)REQUIRE(!wascaret, REG_BADRPT); 463 switch (c) { 464 case '*': /* implemented as +? */ 465 /* this case does not require the (y|) trick, noKLUDGE */ 466 INSERT(OPLUS_, pos); 467 ASTERN(O_PLUS, pos); 468 INSERT(OQUEST_, pos); 469 ASTERN(O_QUEST, pos); 470 break; 471 case '+': 472 INSERT(OPLUS_, pos); 473 ASTERN(O_PLUS, pos); 474 break; 475 case '?': 476 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 477 INSERT(OCH_, pos); /* offset slightly wrong */ 478 ASTERN(OOR1, pos); /* this one's right */ 479 AHEAD(pos); /* fix the OCH_ */ 480 EMIT(OOR2, 0); /* offset very wrong... */ 481 AHEAD(THERE()); /* ...so fix it */ 482 ASTERN(O_CH, THERETHERE()); 483 break; 484 case '{': 485 count = p_count(p); 486 if (EAT(',')) { 487 if (isdigit((uch)PEEK())) { 488 count2 = p_count(p); 489 (void)REQUIRE(count <= count2, REG_BADBR); 490 } else /* single number with comma */ 491 count2 = INFINITY; 492 } else /* just a single number */ 493 count2 = count; 494 repeat(p, pos, count, count2); 495 if (!EAT('}')) { /* error heuristics */ 496 while (MORE() && PEEK() != '}') 497 NEXT(); 498 (void)REQUIRE(MORE(), REG_EBRACE); 499 SETERROR(REG_BADBR); 500 } 501 break; 502 } 503 504 if (!MORE()) 505 return (false); 506 c = PEEK(); 507 if (!( c == '*' || c == '+' || c == '?' || 508 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 509 return (false); 510 SETERROR(REG_BADRPT); 511 return (false); 512 } 513 514 /* 515 - p_str - string (no metacharacters) "parser" 516 == static void p_str(struct parse *p); 517 */ 518 static void 519 p_str(struct parse *p) 520 { 521 (void)REQUIRE(MORE(), REG_EMPTY); 522 while (MORE()) 523 ordinary(p, WGETNEXT()); 524 } 525 526 /* 527 * Eat consecutive branch delimiters for the kind of expression that we are 528 * parsing, return the number of delimiters that we ate. 529 */ 530 static int 531 p_branch_eat_delim(struct parse *p, struct branchc *bc) 532 { 533 int nskip; 534 535 nskip = 0; 536 while (EAT('|')) 537 ++nskip; 538 return (nskip); 539 } 540 541 /* 542 * Insert necessary branch book-keeping operations. This emits a 543 * bogus 'next' offset, since we still have more to parse 544 */ 545 static void 546 p_branch_ins_offset(struct parse *p, struct branchc *bc) 547 { 548 549 if (bc->nbranch == 0) { 550 INSERT(OCH_, bc->start); /* offset is wrong */ 551 bc->fwd = bc->start; 552 bc->back = bc->start; 553 } 554 555 ASTERN(OOR1, bc->back); 556 bc->back = THERE(); 557 AHEAD(bc->fwd); /* fix previous offset */ 558 bc->fwd = HERE(); 559 EMIT(OOR2, 0); /* offset is very wrong */ 560 ++bc->nbranch; 561 } 562 563 /* 564 * Fix the offset of the tail branch, if we actually had any branches. 565 * This is to correct the bogus placeholder offset that we use. 566 */ 567 static void 568 p_branch_fix_tail(struct parse *p, struct branchc *bc) 569 { 570 571 /* Fix bogus offset at the tail if we actually have branches */ 572 if (bc->nbranch > 0) { 573 AHEAD(bc->fwd); 574 ASTERN(O_CH, bc->back); 575 } 576 } 577 578 /* 579 * Signal to the parser that an empty branch has been encountered; this will, 580 * in the future, be used to allow for more permissive behavior with empty 581 * branches. The return value should indicate whether parsing may continue 582 * or not. 583 */ 584 static bool 585 p_branch_empty(struct parse *p, struct branchc *bc) 586 { 587 588 SETERROR(REG_EMPTY); 589 return (false); 590 } 591 592 /* 593 * Take care of any branching requirements. This includes inserting the 594 * appropriate branching instructions as well as eating all of the branch 595 * delimiters until we either run out of pattern or need to parse more pattern. 596 */ 597 static bool 598 p_branch_do(struct parse *p, struct branchc *bc) 599 { 600 int ate = 0; 601 602 ate = p_branch_eat_delim(p, bc); 603 if (ate == 0) 604 return (false); 605 else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc)) 606 /* 607 * Halt parsing only if we have an empty branch and p_branch_empty 608 * indicates that we must not continue. In the future, this will not 609 * necessarily be an error. 610 */ 611 return (false); 612 p_branch_ins_offset(p, bc); 613 614 return (true); 615 } 616 617 static void 618 p_bre_pre_parse(struct parse *p, struct branchc *bc) 619 { 620 621 (void) bc; 622 /* 623 * Does not move cleanly into expression parser because of 624 * ordinary interpration of * at the beginning position of 625 * an expression. 626 */ 627 if (EAT('^')) { 628 EMIT(OBOL, 0); 629 p->g->iflags |= USEBOL; 630 p->g->nbol++; 631 } 632 } 633 634 static void 635 p_bre_post_parse(struct parse *p, struct branchc *bc) 636 { 637 638 /* Expression is terminating due to EOL token */ 639 if (bc->terminate) { 640 DROP(1); 641 EMIT(OEOL, 0); 642 p->g->iflags |= USEEOL; 643 p->g->neol++; 644 } 645 } 646 647 /* 648 - p_re - Top level parser, concatenation and BRE anchoring 649 == static void p_re(struct parse *p, int end1, int end2); 650 * Giving end1 as OUT essentially eliminates the end1/end2 check. 651 * 652 * This implementation is a bit of a kludge, in that a trailing $ is first 653 * taken as an ordinary character and then revised to be an anchor. 654 * The amount of lookahead needed to avoid this kludge is excessive. 655 */ 656 static void 657 p_re(struct parse *p, 658 int end1, /* first terminating character */ 659 int end2) /* second terminating character; ignored for EREs */ 660 { 661 struct branchc bc; 662 663 bc.nbranch = 0; 664 if (end1 == OUT && end2 == OUT) 665 bc.outer = true; 666 else 667 bc.outer = false; 668 #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2)) 669 for (;;) { 670 bc.start = HERE(); 671 bc.nchain = 0; 672 bc.terminate = false; 673 if (p->pre_parse != NULL) 674 p->pre_parse(p, &bc); 675 while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) { 676 bc.terminate = p->parse_expr(p, &bc); 677 ++bc.nchain; 678 } 679 if (p->post_parse != NULL) 680 p->post_parse(p, &bc); 681 (void) REQUIRE(HERE() != bc.start, REG_EMPTY); 682 if (!p->allowbranch) 683 break; 684 /* 685 * p_branch_do's return value indicates whether we should 686 * continue parsing or not. This is both for correctness and 687 * a slight optimization, because it will check if we've 688 * encountered an empty branch or the end of the string 689 * immediately following a branch delimiter. 690 */ 691 if (!p_branch_do(p, &bc)) 692 break; 693 } 694 #undef SEE_END 695 if (p->allowbranch) 696 p_branch_fix_tail(p, &bc); 697 assert(!MORE() || SEE(end1)); 698 } 699 700 /* 701 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 702 == static bool p_simp_re(struct parse *p, struct branchc *bc); 703 */ 704 static bool /* was the simple RE an unbackslashed $? */ 705 p_simp_re(struct parse *p, struct branchc *bc) 706 { 707 int c; 708 int count; 709 int count2; 710 sopno pos; 711 int i; 712 wint_t wc; 713 sopno subno; 714 # define BACKSL (1<<CHAR_BIT) 715 716 pos = HERE(); /* repetition op, if any, covers from here */ 717 718 assert(MORE()); /* caller should have ensured this */ 719 c = GETNEXT(); 720 if (c == '\\') { 721 (void)REQUIRE(MORE(), REG_EESCAPE); 722 c = BACKSL | GETNEXT(); 723 } 724 switch (c) { 725 case '.': 726 if (p->g->cflags®_NEWLINE) 727 nonnewline(p); 728 else 729 EMIT(OANY, 0); 730 break; 731 case '[': 732 p_bracket(p); 733 break; 734 case BACKSL|'<': 735 EMIT(OBOW, 0); 736 break; 737 case BACKSL|'>': 738 EMIT(OEOW, 0); 739 break; 740 case BACKSL|'{': 741 SETERROR(REG_BADRPT); 742 break; 743 case BACKSL|'(': 744 p->g->nsub++; 745 subno = p->g->nsub; 746 if (subno < NPAREN) 747 p->pbegin[subno] = HERE(); 748 EMIT(OLPAREN, subno); 749 /* the MORE here is an error heuristic */ 750 if (MORE() && !SEETWO('\\', ')')) 751 p_re(p, '\\', ')'); 752 if (subno < NPAREN) { 753 p->pend[subno] = HERE(); 754 assert(p->pend[subno] != 0); 755 } 756 EMIT(ORPAREN, subno); 757 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 758 break; 759 case BACKSL|')': /* should not get here -- must be user */ 760 case BACKSL|'}': 761 SETERROR(REG_EPAREN); 762 break; 763 case BACKSL|'1': 764 case BACKSL|'2': 765 case BACKSL|'3': 766 case BACKSL|'4': 767 case BACKSL|'5': 768 case BACKSL|'6': 769 case BACKSL|'7': 770 case BACKSL|'8': 771 case BACKSL|'9': 772 i = (c&~BACKSL) - '0'; 773 assert(i < NPAREN); 774 if (p->pend[i] != 0) { 775 assert(i <= p->g->nsub); 776 EMIT(OBACK_, i); 777 assert(p->pbegin[i] != 0); 778 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 779 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 780 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 781 EMIT(O_BACK, i); 782 } else 783 SETERROR(REG_ESUBREG); 784 p->g->backrefs = 1; 785 break; 786 case '*': 787 /* 788 * Ordinary if used as the first character beyond BOL anchor of 789 * a (sub-)expression, counts as a bad repetition operator if it 790 * appears otherwise. 791 */ 792 (void)REQUIRE(bc->nchain == 0, REG_BADRPT); 793 /* FALLTHROUGH */ 794 default: 795 if (p->error != 0) 796 return (false); /* Definitely not $... */ 797 p->next--; 798 wc = WGETNEXT(); 799 ordinary(p, wc); 800 break; 801 } 802 803 if (EAT('*')) { /* implemented as +? */ 804 /* this case does not require the (y|) trick, noKLUDGE */ 805 INSERT(OPLUS_, pos); 806 ASTERN(O_PLUS, pos); 807 INSERT(OQUEST_, pos); 808 ASTERN(O_QUEST, pos); 809 } else if (EATTWO('\\', '{')) { 810 count = p_count(p); 811 if (EAT(',')) { 812 if (MORE() && isdigit((uch)PEEK())) { 813 count2 = p_count(p); 814 (void)REQUIRE(count <= count2, REG_BADBR); 815 } else /* single number with comma */ 816 count2 = INFINITY; 817 } else /* just a single number */ 818 count2 = count; 819 repeat(p, pos, count, count2); 820 if (!EATTWO('\\', '}')) { /* error heuristics */ 821 while (MORE() && !SEETWO('\\', '}')) 822 NEXT(); 823 (void)REQUIRE(MORE(), REG_EBRACE); 824 SETERROR(REG_BADBR); 825 } 826 } else if (c == '$') /* $ (but not \$) ends it */ 827 return (true); 828 829 return (false); 830 } 831 832 /* 833 - p_count - parse a repetition count 834 == static int p_count(struct parse *p); 835 */ 836 static int /* the value */ 837 p_count(struct parse *p) 838 { 839 int count = 0; 840 int ndigits = 0; 841 842 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 843 count = count*10 + (GETNEXT() - '0'); 844 ndigits++; 845 } 846 847 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 848 return(count); 849 } 850 851 /* 852 - p_bracket - parse a bracketed character list 853 == static void p_bracket(struct parse *p); 854 */ 855 static void 856 p_bracket(struct parse *p) 857 { 858 cset *cs; 859 wint_t ch; 860 861 /* Dept of Truly Sickening Special-Case Kludges */ 862 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 863 EMIT(OBOW, 0); 864 NEXTn(6); 865 return; 866 } 867 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 868 EMIT(OEOW, 0); 869 NEXTn(6); 870 return; 871 } 872 873 if ((cs = allocset(p)) == NULL) 874 return; 875 876 if (p->g->cflags®_ICASE) 877 cs->icase = 1; 878 if (EAT('^')) 879 cs->invert = 1; 880 if (EAT(']')) 881 CHadd(p, cs, ']'); 882 else if (EAT('-')) 883 CHadd(p, cs, '-'); 884 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 885 p_b_term(p, cs); 886 if (EAT('-')) 887 CHadd(p, cs, '-'); 888 (void)MUSTEAT(']', REG_EBRACK); 889 890 if (p->error != 0) /* don't mess things up further */ 891 return; 892 893 if (cs->invert && p->g->cflags®_NEWLINE) 894 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 895 896 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 897 ordinary(p, ch); 898 freeset(p, cs); 899 } else 900 EMIT(OANYOF, (int)(cs - p->g->sets)); 901 } 902 903 /* 904 - p_b_term - parse one term of a bracketed character list 905 == static void p_b_term(struct parse *p, cset *cs); 906 */ 907 static void 908 p_b_term(struct parse *p, cset *cs) 909 { 910 char c; 911 wint_t start, finish; 912 wint_t i; 913 struct xlocale_collate *table = 914 (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE]; 915 916 /* classify what we've got */ 917 switch ((MORE()) ? PEEK() : '\0') { 918 case '[': 919 c = (MORE2()) ? PEEK2() : '\0'; 920 break; 921 case '-': 922 SETERROR(REG_ERANGE); 923 return; /* NOTE RETURN */ 924 default: 925 c = '\0'; 926 break; 927 } 928 929 switch (c) { 930 case ':': /* character class */ 931 NEXT2(); 932 (void)REQUIRE(MORE(), REG_EBRACK); 933 c = PEEK(); 934 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 935 p_b_cclass(p, cs); 936 (void)REQUIRE(MORE(), REG_EBRACK); 937 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 938 break; 939 case '=': /* equivalence class */ 940 NEXT2(); 941 (void)REQUIRE(MORE(), REG_EBRACK); 942 c = PEEK(); 943 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 944 p_b_eclass(p, cs); 945 (void)REQUIRE(MORE(), REG_EBRACK); 946 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 947 break; 948 default: /* symbol, ordinary character, or range */ 949 start = p_b_symbol(p); 950 if (SEE('-') && MORE2() && PEEK2() != ']') { 951 /* range */ 952 NEXT(); 953 if (EAT('-')) 954 finish = '-'; 955 else 956 finish = p_b_symbol(p); 957 } else 958 finish = start; 959 if (start == finish) 960 CHadd(p, cs, start); 961 else { 962 if (table->__collate_load_error || MB_CUR_MAX > 1) { 963 (void)REQUIRE(start <= finish, REG_ERANGE); 964 CHaddrange(p, cs, start, finish); 965 } else { 966 (void)REQUIRE(__wcollate_range_cmp(start, finish) <= 0, REG_ERANGE); 967 for (i = 0; i <= UCHAR_MAX; i++) { 968 if ( __wcollate_range_cmp(start, i) <= 0 969 && __wcollate_range_cmp(i, finish) <= 0 970 ) 971 CHadd(p, cs, i); 972 } 973 } 974 } 975 break; 976 } 977 } 978 979 /* 980 - p_b_cclass - parse a character-class name and deal with it 981 == static void p_b_cclass(struct parse *p, cset *cs); 982 */ 983 static void 984 p_b_cclass(struct parse *p, cset *cs) 985 { 986 const char *sp = p->next; 987 size_t len; 988 wctype_t wct; 989 char clname[16]; 990 991 while (MORE() && isalpha((uch)PEEK())) 992 NEXT(); 993 len = p->next - sp; 994 if (len >= sizeof(clname) - 1) { 995 SETERROR(REG_ECTYPE); 996 return; 997 } 998 memcpy(clname, sp, len); 999 clname[len] = '\0'; 1000 if ((wct = wctype(clname)) == 0) { 1001 SETERROR(REG_ECTYPE); 1002 return; 1003 } 1004 CHaddtype(p, cs, wct); 1005 } 1006 1007 /* 1008 - p_b_eclass - parse an equivalence-class name and deal with it 1009 == static void p_b_eclass(struct parse *p, cset *cs); 1010 * 1011 * This implementation is incomplete. xxx 1012 */ 1013 static void 1014 p_b_eclass(struct parse *p, cset *cs) 1015 { 1016 wint_t c; 1017 1018 c = p_b_coll_elem(p, '='); 1019 CHadd(p, cs, c); 1020 } 1021 1022 /* 1023 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 1024 == static wint_t p_b_symbol(struct parse *p); 1025 */ 1026 static wint_t /* value of symbol */ 1027 p_b_symbol(struct parse *p) 1028 { 1029 wint_t value; 1030 1031 (void)REQUIRE(MORE(), REG_EBRACK); 1032 if (!EATTWO('[', '.')) 1033 return(WGETNEXT()); 1034 1035 /* collating symbol */ 1036 value = p_b_coll_elem(p, '.'); 1037 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 1038 return(value); 1039 } 1040 1041 /* 1042 - p_b_coll_elem - parse a collating-element name and look it up 1043 == static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 1044 */ 1045 static wint_t /* value of collating element */ 1046 p_b_coll_elem(struct parse *p, 1047 wint_t endc) /* name ended by endc,']' */ 1048 { 1049 const char *sp = p->next; 1050 struct cname *cp; 1051 mbstate_t mbs; 1052 wchar_t wc; 1053 size_t clen, len; 1054 1055 while (MORE() && !SEETWO(endc, ']')) 1056 NEXT(); 1057 if (!MORE()) { 1058 SETERROR(REG_EBRACK); 1059 return(0); 1060 } 1061 len = p->next - sp; 1062 for (cp = cnames; cp->name != NULL; cp++) 1063 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 1064 return(cp->code); /* known name */ 1065 memset(&mbs, 0, sizeof(mbs)); 1066 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 1067 return (wc); /* single character */ 1068 else if (clen == (size_t)-1 || clen == (size_t)-2) 1069 SETERROR(REG_ILLSEQ); 1070 else 1071 SETERROR(REG_ECOLLATE); /* neither */ 1072 return(0); 1073 } 1074 1075 /* 1076 - othercase - return the case counterpart of an alphabetic 1077 == static wint_t othercase(wint_t ch); 1078 */ 1079 static wint_t /* if no counterpart, return ch */ 1080 othercase(wint_t ch) 1081 { 1082 assert(iswalpha(ch)); 1083 if (iswupper(ch)) 1084 return(towlower(ch)); 1085 else if (iswlower(ch)) 1086 return(towupper(ch)); 1087 else /* peculiar, but could happen */ 1088 return(ch); 1089 } 1090 1091 /* 1092 - bothcases - emit a dualcase version of a two-case character 1093 == static void bothcases(struct parse *p, wint_t ch); 1094 * 1095 * Boy, is this implementation ever a kludge... 1096 */ 1097 static void 1098 bothcases(struct parse *p, wint_t ch) 1099 { 1100 const char *oldnext = p->next; 1101 const char *oldend = p->end; 1102 char bracket[3 + MB_LEN_MAX]; 1103 size_t n; 1104 mbstate_t mbs; 1105 1106 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1107 p->next = bracket; 1108 memset(&mbs, 0, sizeof(mbs)); 1109 n = wcrtomb(bracket, ch, &mbs); 1110 assert(n != (size_t)-1); 1111 bracket[n] = ']'; 1112 bracket[n + 1] = '\0'; 1113 p->end = bracket+n+1; 1114 p_bracket(p); 1115 assert(p->next == p->end); 1116 p->next = oldnext; 1117 p->end = oldend; 1118 } 1119 1120 /* 1121 - ordinary - emit an ordinary character 1122 == static void ordinary(struct parse *p, wint_t ch); 1123 */ 1124 static void 1125 ordinary(struct parse *p, wint_t ch) 1126 { 1127 cset *cs; 1128 1129 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 1130 bothcases(p, ch); 1131 else if ((ch & OPDMASK) == ch) 1132 EMIT(OCHAR, ch); 1133 else { 1134 /* 1135 * Kludge: character is too big to fit into an OCHAR operand. 1136 * Emit a singleton set. 1137 */ 1138 if ((cs = allocset(p)) == NULL) 1139 return; 1140 CHadd(p, cs, ch); 1141 EMIT(OANYOF, (int)(cs - p->g->sets)); 1142 } 1143 } 1144 1145 /* 1146 - nonnewline - emit REG_NEWLINE version of OANY 1147 == static void nonnewline(struct parse *p); 1148 * 1149 * Boy, is this implementation ever a kludge... 1150 */ 1151 static void 1152 nonnewline(struct parse *p) 1153 { 1154 const char *oldnext = p->next; 1155 const char *oldend = p->end; 1156 char bracket[4]; 1157 1158 p->next = bracket; 1159 p->end = bracket+3; 1160 bracket[0] = '^'; 1161 bracket[1] = '\n'; 1162 bracket[2] = ']'; 1163 bracket[3] = '\0'; 1164 p_bracket(p); 1165 assert(p->next == bracket+3); 1166 p->next = oldnext; 1167 p->end = oldend; 1168 } 1169 1170 /* 1171 - repeat - generate code for a bounded repetition, recursively if needed 1172 == static void repeat(struct parse *p, sopno start, int from, int to); 1173 */ 1174 static void 1175 repeat(struct parse *p, 1176 sopno start, /* operand from here to end of strip */ 1177 int from, /* repeated from this number */ 1178 int to) /* to this number of times (maybe INFINITY) */ 1179 { 1180 sopno finish = HERE(); 1181 # define N 2 1182 # define INF 3 1183 # define REP(f, t) ((f)*8 + (t)) 1184 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1185 sopno copy; 1186 1187 if (p->error != 0) /* head off possible runaway recursion */ 1188 return; 1189 1190 assert(from <= to); 1191 1192 switch (REP(MAP(from), MAP(to))) { 1193 case REP(0, 0): /* must be user doing this */ 1194 DROP(finish-start); /* drop the operand */ 1195 break; 1196 case REP(0, 1): /* as x{1,1}? */ 1197 case REP(0, N): /* as x{1,n}? */ 1198 case REP(0, INF): /* as x{1,}? */ 1199 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1200 INSERT(OCH_, start); /* offset is wrong... */ 1201 repeat(p, start+1, 1, to); 1202 ASTERN(OOR1, start); 1203 AHEAD(start); /* ... fix it */ 1204 EMIT(OOR2, 0); 1205 AHEAD(THERE()); 1206 ASTERN(O_CH, THERETHERE()); 1207 break; 1208 case REP(1, 1): /* trivial case */ 1209 /* done */ 1210 break; 1211 case REP(1, N): /* as x?x{1,n-1} */ 1212 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1213 INSERT(OCH_, start); 1214 ASTERN(OOR1, start); 1215 AHEAD(start); 1216 EMIT(OOR2, 0); /* offset very wrong... */ 1217 AHEAD(THERE()); /* ...so fix it */ 1218 ASTERN(O_CH, THERETHERE()); 1219 copy = dupl(p, start+1, finish+1); 1220 assert(copy == finish+4); 1221 repeat(p, copy, 1, to-1); 1222 break; 1223 case REP(1, INF): /* as x+ */ 1224 INSERT(OPLUS_, start); 1225 ASTERN(O_PLUS, start); 1226 break; 1227 case REP(N, N): /* as xx{m-1,n-1} */ 1228 copy = dupl(p, start, finish); 1229 repeat(p, copy, from-1, to-1); 1230 break; 1231 case REP(N, INF): /* as xx{n-1,INF} */ 1232 copy = dupl(p, start, finish); 1233 repeat(p, copy, from-1, to); 1234 break; 1235 default: /* "can't happen" */ 1236 SETERROR(REG_ASSERT); /* just in case */ 1237 break; 1238 } 1239 } 1240 1241 /* 1242 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1243 - character from the parse struct, signals a REG_ILLSEQ error if the 1244 - character can't be converted. Returns the number of bytes consumed. 1245 */ 1246 static wint_t 1247 wgetnext(struct parse *p) 1248 { 1249 mbstate_t mbs; 1250 wchar_t wc; 1251 size_t n; 1252 1253 memset(&mbs, 0, sizeof(mbs)); 1254 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); 1255 if (n == (size_t)-1 || n == (size_t)-2) { 1256 SETERROR(REG_ILLSEQ); 1257 return (0); 1258 } 1259 if (n == 0) 1260 n = 1; 1261 p->next += n; 1262 return (wc); 1263 } 1264 1265 /* 1266 - seterr - set an error condition 1267 == static int seterr(struct parse *p, int e); 1268 */ 1269 static int /* useless but makes type checking happy */ 1270 seterr(struct parse *p, int e) 1271 { 1272 if (p->error == 0) /* keep earliest error condition */ 1273 p->error = e; 1274 p->next = nuls; /* try to bring things to a halt */ 1275 p->end = nuls; 1276 return(0); /* make the return value well-defined */ 1277 } 1278 1279 /* 1280 - allocset - allocate a set of characters for [] 1281 == static cset *allocset(struct parse *p); 1282 */ 1283 static cset * 1284 allocset(struct parse *p) 1285 { 1286 cset *cs, *ncs; 1287 1288 ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs)); 1289 if (ncs == NULL) { 1290 SETERROR(REG_ESPACE); 1291 return (NULL); 1292 } 1293 p->g->sets = ncs; 1294 cs = &p->g->sets[p->g->ncsets++]; 1295 memset(cs, 0, sizeof(*cs)); 1296 1297 return(cs); 1298 } 1299 1300 /* 1301 - freeset - free a now-unused set 1302 == static void freeset(struct parse *p, cset *cs); 1303 */ 1304 static void 1305 freeset(struct parse *p, cset *cs) 1306 { 1307 cset *top = &p->g->sets[p->g->ncsets]; 1308 1309 free(cs->wides); 1310 free(cs->ranges); 1311 free(cs->types); 1312 memset(cs, 0, sizeof(*cs)); 1313 if (cs == top-1) /* recover only the easy case */ 1314 p->g->ncsets--; 1315 } 1316 1317 /* 1318 - singleton - Determine whether a set contains only one character, 1319 - returning it if so, otherwise returning OUT. 1320 */ 1321 static wint_t 1322 singleton(cset *cs) 1323 { 1324 wint_t i, s, n; 1325 1326 for (i = n = 0; i < NC; i++) 1327 if (CHIN(cs, i)) { 1328 n++; 1329 s = i; 1330 } 1331 if (n == 1) 1332 return (s); 1333 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1334 cs->icase == 0) 1335 return (cs->wides[0]); 1336 /* Don't bother handling the other cases. */ 1337 return (OUT); 1338 } 1339 1340 /* 1341 - CHadd - add character to character set. 1342 */ 1343 static void 1344 CHadd(struct parse *p, cset *cs, wint_t ch) 1345 { 1346 wint_t nch, *newwides; 1347 assert(ch >= 0); 1348 if (ch < NC) 1349 cs->bmp[ch >> 3] |= 1 << (ch & 7); 1350 else { 1351 newwides = reallocarray(cs->wides, cs->nwides + 1, 1352 sizeof(*cs->wides)); 1353 if (newwides == NULL) { 1354 SETERROR(REG_ESPACE); 1355 return; 1356 } 1357 cs->wides = newwides; 1358 cs->wides[cs->nwides++] = ch; 1359 } 1360 if (cs->icase) { 1361 if ((nch = towlower(ch)) < NC) 1362 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1363 if ((nch = towupper(ch)) < NC) 1364 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1365 } 1366 } 1367 1368 /* 1369 - CHaddrange - add all characters in the range [min,max] to a character set. 1370 */ 1371 static void 1372 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 1373 { 1374 crange *newranges; 1375 1376 for (; min < NC && min <= max; min++) 1377 CHadd(p, cs, min); 1378 if (min >= max) 1379 return; 1380 newranges = reallocarray(cs->ranges, cs->nranges + 1, 1381 sizeof(*cs->ranges)); 1382 if (newranges == NULL) { 1383 SETERROR(REG_ESPACE); 1384 return; 1385 } 1386 cs->ranges = newranges; 1387 cs->ranges[cs->nranges].min = min; 1388 cs->ranges[cs->nranges].max = max; 1389 cs->nranges++; 1390 } 1391 1392 /* 1393 - CHaddtype - add all characters of a certain type to a character set. 1394 */ 1395 static void 1396 CHaddtype(struct parse *p, cset *cs, wctype_t wct) 1397 { 1398 wint_t i; 1399 wctype_t *newtypes; 1400 1401 for (i = 0; i < NC; i++) 1402 if (iswctype(i, wct)) 1403 CHadd(p, cs, i); 1404 newtypes = reallocarray(cs->types, cs->ntypes + 1, 1405 sizeof(*cs->types)); 1406 if (newtypes == NULL) { 1407 SETERROR(REG_ESPACE); 1408 return; 1409 } 1410 cs->types = newtypes; 1411 cs->types[cs->ntypes++] = wct; 1412 } 1413 1414 /* 1415 - dupl - emit a duplicate of a bunch of sops 1416 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1417 */ 1418 static sopno /* start of duplicate */ 1419 dupl(struct parse *p, 1420 sopno start, /* from here */ 1421 sopno finish) /* to this less one */ 1422 { 1423 sopno ret = HERE(); 1424 sopno len = finish - start; 1425 1426 assert(finish >= start); 1427 if (len == 0) 1428 return(ret); 1429 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1430 return(ret); 1431 (void) memcpy((char *)(p->strip + p->slen), 1432 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1433 p->slen += len; 1434 return(ret); 1435 } 1436 1437 /* 1438 - doemit - emit a strip operator 1439 == static void doemit(struct parse *p, sop op, size_t opnd); 1440 * 1441 * It might seem better to implement this as a macro with a function as 1442 * hard-case backup, but it's just too big and messy unless there are 1443 * some changes to the data structures. Maybe later. 1444 */ 1445 static void 1446 doemit(struct parse *p, sop op, size_t opnd) 1447 { 1448 /* avoid making error situations worse */ 1449 if (p->error != 0) 1450 return; 1451 1452 /* deal with oversize operands ("can't happen", more or less) */ 1453 assert(opnd < 1<<OPSHIFT); 1454 1455 /* deal with undersized strip */ 1456 if (p->slen >= p->ssize) 1457 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1458 return; 1459 1460 /* finally, it's all reduced to the easy case */ 1461 p->strip[p->slen++] = SOP(op, opnd); 1462 } 1463 1464 /* 1465 - doinsert - insert a sop into the strip 1466 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1467 */ 1468 static void 1469 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1470 { 1471 sopno sn; 1472 sop s; 1473 int i; 1474 1475 /* avoid making error situations worse */ 1476 if (p->error != 0) 1477 return; 1478 1479 sn = HERE(); 1480 EMIT(op, opnd); /* do checks, ensure space */ 1481 assert(HERE() == sn+1); 1482 s = p->strip[sn]; 1483 1484 /* adjust paren pointers */ 1485 assert(pos > 0); 1486 for (i = 1; i < NPAREN; i++) { 1487 if (p->pbegin[i] >= pos) { 1488 p->pbegin[i]++; 1489 } 1490 if (p->pend[i] >= pos) { 1491 p->pend[i]++; 1492 } 1493 } 1494 1495 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1496 (HERE()-pos-1)*sizeof(sop)); 1497 p->strip[pos] = s; 1498 } 1499 1500 /* 1501 - dofwd - complete a forward reference 1502 == static void dofwd(struct parse *p, sopno pos, sop value); 1503 */ 1504 static void 1505 dofwd(struct parse *p, sopno pos, sop value) 1506 { 1507 /* avoid making error situations worse */ 1508 if (p->error != 0) 1509 return; 1510 1511 assert(value < 1<<OPSHIFT); 1512 p->strip[pos] = OP(p->strip[pos]) | value; 1513 } 1514 1515 /* 1516 - enlarge - enlarge the strip 1517 == static int enlarge(struct parse *p, sopno size); 1518 */ 1519 static int 1520 enlarge(struct parse *p, sopno size) 1521 { 1522 sop *sp; 1523 1524 if (p->ssize >= size) 1525 return 1; 1526 1527 sp = reallocarray(p->strip, size, sizeof(sop)); 1528 if (sp == NULL) { 1529 SETERROR(REG_ESPACE); 1530 return 0; 1531 } 1532 p->strip = sp; 1533 p->ssize = size; 1534 return 1; 1535 } 1536 1537 /* 1538 - stripsnug - compact the strip 1539 == static void stripsnug(struct parse *p, struct re_guts *g); 1540 */ 1541 static void 1542 stripsnug(struct parse *p, struct re_guts *g) 1543 { 1544 g->nstates = p->slen; 1545 g->strip = reallocarray((char *)p->strip, p->slen, sizeof(sop)); 1546 if (g->strip == NULL) { 1547 SETERROR(REG_ESPACE); 1548 g->strip = p->strip; 1549 } 1550 } 1551 1552 /* 1553 - findmust - fill in must and mlen with longest mandatory literal string 1554 == static void findmust(struct parse *p, struct re_guts *g); 1555 * 1556 * This algorithm could do fancy things like analyzing the operands of | 1557 * for common subsequences. Someday. This code is simple and finds most 1558 * of the interesting cases. 1559 * 1560 * Note that must and mlen got initialized during setup. 1561 */ 1562 static void 1563 findmust(struct parse *p, struct re_guts *g) 1564 { 1565 sop *scan; 1566 sop *start = NULL; 1567 sop *newstart = NULL; 1568 sopno newlen; 1569 sop s; 1570 char *cp; 1571 int offset; 1572 char buf[MB_LEN_MAX]; 1573 size_t clen; 1574 mbstate_t mbs; 1575 1576 /* avoid making error situations worse */ 1577 if (p->error != 0) 1578 return; 1579 1580 /* 1581 * It's not generally safe to do a ``char'' substring search on 1582 * multibyte character strings, but it's safe for at least 1583 * UTF-8 (see RFC 3629). 1584 */ 1585 if (MB_CUR_MAX > 1 && 1586 strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) 1587 return; 1588 1589 /* find the longest OCHAR sequence in strip */ 1590 newlen = 0; 1591 offset = 0; 1592 g->moffset = 0; 1593 scan = g->strip + 1; 1594 do { 1595 s = *scan++; 1596 switch (OP(s)) { 1597 case OCHAR: /* sequence member */ 1598 if (newlen == 0) { /* new sequence */ 1599 memset(&mbs, 0, sizeof(mbs)); 1600 newstart = scan - 1; 1601 } 1602 clen = wcrtomb(buf, OPND(s), &mbs); 1603 if (clen == (size_t)-1) 1604 goto toohard; 1605 newlen += clen; 1606 break; 1607 case OPLUS_: /* things that don't break one */ 1608 case OLPAREN: 1609 case ORPAREN: 1610 break; 1611 case OQUEST_: /* things that must be skipped */ 1612 case OCH_: 1613 offset = altoffset(scan, offset); 1614 scan--; 1615 do { 1616 scan += OPND(s); 1617 s = *scan; 1618 /* assert() interferes w debug printouts */ 1619 if (OP(s) != O_QUEST && OP(s) != O_CH && 1620 OP(s) != OOR2) { 1621 g->iflags |= BAD; 1622 return; 1623 } 1624 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1625 /* FALLTHROUGH */ 1626 case OBOW: /* things that break a sequence */ 1627 case OEOW: 1628 case OBOL: 1629 case OEOL: 1630 case O_QUEST: 1631 case O_CH: 1632 case OEND: 1633 if (newlen > g->mlen) { /* ends one */ 1634 start = newstart; 1635 g->mlen = newlen; 1636 if (offset > -1) { 1637 g->moffset += offset; 1638 offset = newlen; 1639 } else 1640 g->moffset = offset; 1641 } else { 1642 if (offset > -1) 1643 offset += newlen; 1644 } 1645 newlen = 0; 1646 break; 1647 case OANY: 1648 if (newlen > g->mlen) { /* ends one */ 1649 start = newstart; 1650 g->mlen = newlen; 1651 if (offset > -1) { 1652 g->moffset += offset; 1653 offset = newlen; 1654 } else 1655 g->moffset = offset; 1656 } else { 1657 if (offset > -1) 1658 offset += newlen; 1659 } 1660 if (offset > -1) 1661 offset++; 1662 newlen = 0; 1663 break; 1664 case OANYOF: /* may or may not invalidate offset */ 1665 /* First, everything as OANY */ 1666 if (newlen > g->mlen) { /* ends one */ 1667 start = newstart; 1668 g->mlen = newlen; 1669 if (offset > -1) { 1670 g->moffset += offset; 1671 offset = newlen; 1672 } else 1673 g->moffset = offset; 1674 } else { 1675 if (offset > -1) 1676 offset += newlen; 1677 } 1678 if (offset > -1) 1679 offset++; 1680 newlen = 0; 1681 break; 1682 toohard: 1683 default: 1684 /* Anything here makes it impossible or too hard 1685 * to calculate the offset -- so we give up; 1686 * save the last known good offset, in case the 1687 * must sequence doesn't occur later. 1688 */ 1689 if (newlen > g->mlen) { /* ends one */ 1690 start = newstart; 1691 g->mlen = newlen; 1692 if (offset > -1) 1693 g->moffset += offset; 1694 else 1695 g->moffset = offset; 1696 } 1697 offset = -1; 1698 newlen = 0; 1699 break; 1700 } 1701 } while (OP(s) != OEND); 1702 1703 if (g->mlen == 0) { /* there isn't one */ 1704 g->moffset = -1; 1705 return; 1706 } 1707 1708 /* turn it into a character string */ 1709 g->must = malloc((size_t)g->mlen + 1); 1710 if (g->must == NULL) { /* argh; just forget it */ 1711 g->mlen = 0; 1712 g->moffset = -1; 1713 return; 1714 } 1715 cp = g->must; 1716 scan = start; 1717 memset(&mbs, 0, sizeof(mbs)); 1718 while (cp < g->must + g->mlen) { 1719 while (OP(s = *scan++) != OCHAR) 1720 continue; 1721 clen = wcrtomb(cp, OPND(s), &mbs); 1722 assert(clen != (size_t)-1); 1723 cp += clen; 1724 } 1725 assert(cp == g->must + g->mlen); 1726 *cp++ = '\0'; /* just on general principles */ 1727 } 1728 1729 /* 1730 - altoffset - choose biggest offset among multiple choices 1731 == static int altoffset(sop *scan, int offset); 1732 * 1733 * Compute, recursively if necessary, the largest offset among multiple 1734 * re paths. 1735 */ 1736 static int 1737 altoffset(sop *scan, int offset) 1738 { 1739 int largest; 1740 int try; 1741 sop s; 1742 1743 /* If we gave up already on offsets, return */ 1744 if (offset == -1) 1745 return -1; 1746 1747 largest = 0; 1748 try = 0; 1749 s = *scan++; 1750 while (OP(s) != O_QUEST && OP(s) != O_CH) { 1751 switch (OP(s)) { 1752 case OOR1: 1753 if (try > largest) 1754 largest = try; 1755 try = 0; 1756 break; 1757 case OQUEST_: 1758 case OCH_: 1759 try = altoffset(scan, try); 1760 if (try == -1) 1761 return -1; 1762 scan--; 1763 do { 1764 scan += OPND(s); 1765 s = *scan; 1766 if (OP(s) != O_QUEST && OP(s) != O_CH && 1767 OP(s) != OOR2) 1768 return -1; 1769 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1770 /* We must skip to the next position, or we'll 1771 * leave altoffset() too early. 1772 */ 1773 scan++; 1774 break; 1775 case OANYOF: 1776 case OCHAR: 1777 case OANY: 1778 try++; 1779 case OBOW: 1780 case OEOW: 1781 case OLPAREN: 1782 case ORPAREN: 1783 case OOR2: 1784 break; 1785 default: 1786 try = -1; 1787 break; 1788 } 1789 if (try == -1) 1790 return -1; 1791 s = *scan++; 1792 } 1793 1794 if (try > largest) 1795 largest = try; 1796 1797 return largest+offset; 1798 } 1799 1800 /* 1801 - computejumps - compute char jumps for BM scan 1802 == static void computejumps(struct parse *p, struct re_guts *g); 1803 * 1804 * This algorithm assumes g->must exists and is has size greater than 1805 * zero. It's based on the algorithm found on Computer Algorithms by 1806 * Sara Baase. 1807 * 1808 * A char jump is the number of characters one needs to jump based on 1809 * the value of the character from the text that was mismatched. 1810 */ 1811 static void 1812 computejumps(struct parse *p, struct re_guts *g) 1813 { 1814 int ch; 1815 int mindex; 1816 1817 /* Avoid making errors worse */ 1818 if (p->error != 0) 1819 return; 1820 1821 g->charjump = (int*) malloc((NC + 1) * sizeof(int)); 1822 if (g->charjump == NULL) /* Not a fatal error */ 1823 return; 1824 /* Adjust for signed chars, if necessary */ 1825 g->charjump = &g->charjump[-(CHAR_MIN)]; 1826 1827 /* If the character does not exist in the pattern, the jump 1828 * is equal to the number of characters in the pattern. 1829 */ 1830 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 1831 g->charjump[ch] = g->mlen; 1832 1833 /* If the character does exist, compute the jump that would 1834 * take us to the last character in the pattern equal to it 1835 * (notice that we match right to left, so that last character 1836 * is the first one that would be matched). 1837 */ 1838 for (mindex = 0; mindex < g->mlen; mindex++) 1839 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 1840 } 1841 1842 /* 1843 - computematchjumps - compute match jumps for BM scan 1844 == static void computematchjumps(struct parse *p, struct re_guts *g); 1845 * 1846 * This algorithm assumes g->must exists and is has size greater than 1847 * zero. It's based on the algorithm found on Computer Algorithms by 1848 * Sara Baase. 1849 * 1850 * A match jump is the number of characters one needs to advance based 1851 * on the already-matched suffix. 1852 * Notice that all values here are minus (g->mlen-1), because of the way 1853 * the search algorithm works. 1854 */ 1855 static void 1856 computematchjumps(struct parse *p, struct re_guts *g) 1857 { 1858 int mindex; /* General "must" iterator */ 1859 int suffix; /* Keeps track of matching suffix */ 1860 int ssuffix; /* Keeps track of suffixes' suffix */ 1861 int* pmatches; /* pmatches[k] points to the next i 1862 * such that i+1...mlen is a substring 1863 * of k+1...k+mlen-i-1 1864 */ 1865 1866 /* Avoid making errors worse */ 1867 if (p->error != 0) 1868 return; 1869 1870 pmatches = (int*) malloc(g->mlen * sizeof(int)); 1871 if (pmatches == NULL) { 1872 g->matchjump = NULL; 1873 return; 1874 } 1875 1876 g->matchjump = (int*) malloc(g->mlen * sizeof(int)); 1877 if (g->matchjump == NULL) { /* Not a fatal error */ 1878 free(pmatches); 1879 return; 1880 } 1881 1882 /* Set maximum possible jump for each character in the pattern */ 1883 for (mindex = 0; mindex < g->mlen; mindex++) 1884 g->matchjump[mindex] = 2*g->mlen - mindex - 1; 1885 1886 /* Compute pmatches[] */ 1887 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 1888 mindex--, suffix--) { 1889 pmatches[mindex] = suffix; 1890 1891 /* If a mismatch is found, interrupting the substring, 1892 * compute the matchjump for that position. If no 1893 * mismatch is found, then a text substring mismatched 1894 * against the suffix will also mismatch against the 1895 * substring. 1896 */ 1897 while (suffix < g->mlen 1898 && g->must[mindex] != g->must[suffix]) { 1899 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1900 g->mlen - mindex - 1); 1901 suffix = pmatches[suffix]; 1902 } 1903 } 1904 1905 /* Compute the matchjump up to the last substring found to jump 1906 * to the beginning of the largest must pattern prefix matching 1907 * it's own suffix. 1908 */ 1909 for (mindex = 0; mindex <= suffix; mindex++) 1910 g->matchjump[mindex] = MIN(g->matchjump[mindex], 1911 g->mlen + suffix - mindex); 1912 1913 ssuffix = pmatches[suffix]; 1914 while (suffix < g->mlen) { 1915 while (suffix <= ssuffix && suffix < g->mlen) { 1916 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1917 g->mlen + ssuffix - suffix); 1918 suffix++; 1919 } 1920 if (suffix < g->mlen) 1921 ssuffix = pmatches[ssuffix]; 1922 } 1923 1924 free(pmatches); 1925 } 1926 1927 /* 1928 - pluscount - count + nesting 1929 == static sopno pluscount(struct parse *p, struct re_guts *g); 1930 */ 1931 static sopno /* nesting depth */ 1932 pluscount(struct parse *p, struct re_guts *g) 1933 { 1934 sop *scan; 1935 sop s; 1936 sopno plusnest = 0; 1937 sopno maxnest = 0; 1938 1939 if (p->error != 0) 1940 return(0); /* there may not be an OEND */ 1941 1942 scan = g->strip + 1; 1943 do { 1944 s = *scan++; 1945 switch (OP(s)) { 1946 case OPLUS_: 1947 plusnest++; 1948 break; 1949 case O_PLUS: 1950 if (plusnest > maxnest) 1951 maxnest = plusnest; 1952 plusnest--; 1953 break; 1954 } 1955 } while (OP(s) != OEND); 1956 if (plusnest != 0) 1957 g->iflags |= BAD; 1958 return(maxnest); 1959 } 1960