1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Copyright (c) 2011 The FreeBSD Foundation 9 * 10 * Portions of this software were developed by David Chisnall 11 * under sponsorship from the FreeBSD Foundation. 12 * 13 * This code is derived from software contributed to Berkeley by 14 * Henry Spencer. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 */ 40 41 #include <sys/types.h> 42 #include <stdio.h> 43 #include <string.h> 44 #include <ctype.h> 45 #include <limits.h> 46 #include <stdlib.h> 47 #include <regex.h> 48 #include <stdbool.h> 49 #include <wchar.h> 50 #include <wctype.h> 51 52 #ifndef LIBREGEX 53 #include "collate.h" 54 #endif 55 56 #include "utils.h" 57 #include "regex2.h" 58 59 #include "cname.h" 60 61 /* 62 * Branching context, used to keep track of branch state for all of the branch- 63 * aware functions. In addition to keeping track of branch positions for the 64 * p_branch_* functions, we use this to simplify some clumsiness in BREs for 65 * detection of whether ^ is acting as an anchor or being used erroneously and 66 * also for whether we're in a sub-expression or not. 67 */ 68 struct branchc { 69 sopno start; 70 sopno back; 71 sopno fwd; 72 73 int nbranch; 74 int nchain; 75 bool outer; 76 bool terminate; 77 }; 78 79 /* 80 * parse structure, passed up and down to avoid global variables and 81 * other clumsinesses 82 */ 83 struct parse { 84 const char *next; /* next character in RE */ 85 const char *end; /* end of string (-> NUL normally) */ 86 int error; /* has an error been seen? */ 87 int gnuext; 88 sop *strip; /* malloced strip */ 89 sopno ssize; /* malloced strip size (allocated) */ 90 sopno slen; /* malloced strip length (used) */ 91 int ncsalloc; /* number of csets allocated */ 92 struct re_guts *g; 93 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 94 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 95 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 96 bool allowbranch; /* can this expression branch? */ 97 bool bre; /* convenience; is this a BRE? */ 98 int pflags; /* other parsing flags -- legacy escapes? */ 99 bool (*parse_expr)(struct parse *, struct branchc *); 100 void (*pre_parse)(struct parse *, struct branchc *); 101 void (*post_parse)(struct parse *, struct branchc *); 102 }; 103 104 #define PFLAG_LEGACY_ESC 0x00000001 105 106 /* ========= begin header generated by ./mkh ========= */ 107 #ifdef __cplusplus 108 extern "C" { 109 #endif 110 111 /* === regcomp.c === */ 112 static bool p_ere_exp(struct parse *p, struct branchc *bc); 113 static void p_str(struct parse *p); 114 static int p_branch_eat_delim(struct parse *p, struct branchc *bc); 115 static void p_branch_ins_offset(struct parse *p, struct branchc *bc); 116 static void p_branch_fix_tail(struct parse *p, struct branchc *bc); 117 static bool p_branch_empty(struct parse *p, struct branchc *bc); 118 static bool p_branch_do(struct parse *p, struct branchc *bc); 119 static void p_bre_pre_parse(struct parse *p, struct branchc *bc); 120 static void p_bre_post_parse(struct parse *p, struct branchc *bc); 121 static void p_re(struct parse *p, int end1, int end2); 122 static bool p_simp_re(struct parse *p, struct branchc *bc); 123 static int p_count(struct parse *p); 124 static void p_bracket(struct parse *p); 125 static int p_range_cmp(wchar_t c1, wchar_t c2); 126 static void p_b_term(struct parse *p, cset *cs); 127 static int p_b_pseudoclass(struct parse *p, char c); 128 static void p_b_cclass(struct parse *p, cset *cs); 129 static void p_b_cclass_named(struct parse *p, cset *cs, const char[]); 130 static void p_b_eclass(struct parse *p, cset *cs); 131 static wint_t p_b_symbol(struct parse *p); 132 static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 133 static bool may_escape(struct parse *p, const wint_t ch); 134 static wint_t othercase(wint_t ch); 135 static void bothcases(struct parse *p, wint_t ch); 136 static void ordinary(struct parse *p, wint_t ch); 137 static void nonnewline(struct parse *p); 138 static void repeat(struct parse *p, sopno start, int from, int to); 139 static int seterr(struct parse *p, int e); 140 static cset *allocset(struct parse *p); 141 static void freeset(struct parse *p, cset *cs); 142 static void CHadd(struct parse *p, cset *cs, wint_t ch); 143 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 144 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 145 static wint_t singleton(cset *cs); 146 static sopno dupl(struct parse *p, sopno start, sopno finish); 147 static void doemit(struct parse *p, sop op, size_t opnd); 148 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 149 static void dofwd(struct parse *p, sopno pos, sop value); 150 static int enlarge(struct parse *p, sopno size); 151 static void stripsnug(struct parse *p, struct re_guts *g); 152 static void findmust(struct parse *p, struct re_guts *g); 153 static int altoffset(sop *scan, int offset); 154 static void computejumps(struct parse *p, struct re_guts *g); 155 static void computematchjumps(struct parse *p, struct re_guts *g); 156 static sopno pluscount(struct parse *p, struct re_guts *g); 157 static wint_t wgetnext(struct parse *p); 158 159 #ifdef __cplusplus 160 } 161 #endif 162 /* ========= end header generated by ./mkh ========= */ 163 164 static char nuls[10]; /* place to point scanner in event of error */ 165 166 /* 167 * macros for use with parse structure 168 * BEWARE: these know that the parse structure is named `p' !!! 169 */ 170 #define PEEK() (*p->next) 171 #define PEEK2() (*(p->next+1)) 172 #define MORE() (p->end - p->next > 0) 173 #define MORE2() (p->end - p->next > 1) 174 #define SEE(c) (MORE() && PEEK() == (c)) 175 #define SEETWO(a, b) (MORE2() && PEEK() == (a) && PEEK2() == (b)) 176 #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a)) 177 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 178 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 179 #define EATSPEC(a) (p->bre ? EATTWO('\\', a) : EAT(a)) 180 #define NEXT() (p->next++) 181 #define NEXT2() (p->next += 2) 182 #define NEXTn(n) (p->next += (n)) 183 #define GETNEXT() (*p->next++) 184 #define WGETNEXT() wgetnext(p) 185 #define SETERROR(e) seterr(p, (e)) 186 #define REQUIRE(co, e) ((co) || SETERROR(e)) 187 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 188 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 189 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 190 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 191 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 192 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 193 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 194 #define HERE() (p->slen) 195 #define THERE() (p->slen - 1) 196 #define THERETHERE() (p->slen - 2) 197 #define DROP(n) (p->slen -= (n)) 198 199 /* Macro used by computejump()/computematchjump() */ 200 #define MIN(a,b) ((a)<(b)?(a):(b)) 201 202 static int /* 0 success, otherwise REG_something */ 203 regcomp_internal(regex_t * __restrict preg, 204 const char * __restrict pattern, 205 int cflags, int pflags) 206 { 207 struct parse pa; 208 struct re_guts *g; 209 struct parse *p = &pa; 210 int i; 211 size_t len; 212 size_t maxlen; 213 #ifdef REDEBUG 214 # define GOODFLAGS(f) (f) 215 #else 216 # define GOODFLAGS(f) ((f)&~REG_DUMP) 217 #endif 218 219 cflags = GOODFLAGS(cflags); 220 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 221 return(REG_INVARG); 222 223 if (cflags®_PEND) { 224 if (preg->re_endp < pattern) 225 return(REG_INVARG); 226 len = preg->re_endp - pattern; 227 } else 228 len = strlen(pattern); 229 230 /* do the mallocs early so failure handling is easy */ 231 g = (struct re_guts *)malloc(sizeof(struct re_guts)); 232 if (g == NULL) 233 return(REG_ESPACE); 234 /* 235 * Limit the pattern space to avoid a 32-bit overflow on buffer 236 * extension. Also avoid any signed overflow in case of conversion 237 * so make the real limit based on a 31-bit overflow. 238 * 239 * Likely not applicable on 64-bit systems but handle the case 240 * generically (who are we to stop people from using ~715MB+ 241 * patterns?). 242 */ 243 maxlen = ((size_t)-1 >> 1) / sizeof(sop) * 2 / 3; 244 if (len >= maxlen) { 245 free((char *)g); 246 return(REG_ESPACE); 247 } 248 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 249 assert(p->ssize >= len); 250 251 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 252 p->slen = 0; 253 if (p->strip == NULL) { 254 free((char *)g); 255 return(REG_ESPACE); 256 } 257 258 /* set things up */ 259 p->g = g; 260 p->next = pattern; /* convenience; we do not modify it */ 261 p->end = p->next + len; 262 p->error = 0; 263 p->ncsalloc = 0; 264 p->pflags = pflags; 265 for (i = 0; i < NPAREN; i++) { 266 p->pbegin[i] = 0; 267 p->pend[i] = 0; 268 } 269 #ifdef LIBREGEX 270 if (cflags®_POSIX) { 271 p->gnuext = false; 272 p->allowbranch = (cflags & REG_EXTENDED) != 0; 273 } else 274 p->gnuext = p->allowbranch = true; 275 #else 276 p->gnuext = false; 277 p->allowbranch = (cflags & REG_EXTENDED) != 0; 278 #endif 279 if (cflags & REG_EXTENDED) { 280 p->bre = false; 281 p->parse_expr = p_ere_exp; 282 p->pre_parse = NULL; 283 p->post_parse = NULL; 284 } else { 285 p->bre = true; 286 p->parse_expr = p_simp_re; 287 p->pre_parse = p_bre_pre_parse; 288 p->post_parse = p_bre_post_parse; 289 } 290 g->sets = NULL; 291 g->ncsets = 0; 292 g->cflags = cflags; 293 g->iflags = 0; 294 g->nbol = 0; 295 g->neol = 0; 296 g->must = NULL; 297 g->moffset = -1; 298 g->charjump = NULL; 299 g->matchjump = NULL; 300 g->mlen = 0; 301 g->nsub = 0; 302 g->backrefs = 0; 303 304 /* do it */ 305 EMIT(OEND, 0); 306 g->firststate = THERE(); 307 if (cflags & REG_NOSPEC) 308 p_str(p); 309 else 310 p_re(p, OUT, OUT); 311 EMIT(OEND, 0); 312 g->laststate = THERE(); 313 314 /* tidy up loose ends and fill things in */ 315 stripsnug(p, g); 316 findmust(p, g); 317 /* only use Boyer-Moore algorithm if the pattern is bigger 318 * than three characters 319 */ 320 if(g->mlen > 3) { 321 computejumps(p, g); 322 computematchjumps(p, g); 323 if(g->matchjump == NULL && g->charjump != NULL) { 324 free(&g->charjump[CHAR_MIN]); 325 g->charjump = NULL; 326 } 327 } 328 g->nplus = pluscount(p, g); 329 g->magic = MAGIC2; 330 preg->re_nsub = g->nsub; 331 preg->re_g = g; 332 preg->re_magic = MAGIC1; 333 #ifndef REDEBUG 334 /* not debugging, so can't rely on the assert() in regexec() */ 335 if (g->iflags&BAD) 336 SETERROR(REG_ASSERT); 337 #endif 338 339 /* win or lose, we're done */ 340 if (p->error != 0) /* lose */ 341 regfree(preg); 342 return(p->error); 343 } 344 345 /* 346 - regcomp - interface for parser and compilation 347 = extern int regcomp(regex_t *, const char *, int); 348 = #define REG_BASIC 0000 349 = #define REG_EXTENDED 0001 350 = #define REG_ICASE 0002 351 = #define REG_NOSUB 0004 352 = #define REG_NEWLINE 0010 353 = #define REG_NOSPEC 0020 354 = #define REG_PEND 0040 355 = #define REG_DUMP 0200 356 */ 357 int /* 0 success, otherwise REG_something */ 358 regcomp(regex_t * __restrict preg, 359 const char * __restrict pattern, 360 int cflags) 361 { 362 363 return (regcomp_internal(preg, pattern, cflags, 0)); 364 } 365 366 #ifndef LIBREGEX 367 /* 368 * Legacy interface that requires more lax escaping behavior. 369 */ 370 int 371 freebsd12_regcomp(regex_t * __restrict preg, 372 const char * __restrict pattern, 373 int cflags, int pflags) 374 { 375 376 return (regcomp_internal(preg, pattern, cflags, PFLAG_LEGACY_ESC)); 377 } 378 379 __sym_compat(regcomp, freebsd12_regcomp, FBSD_1.0); 380 #endif /* !LIBREGEX */ 381 382 /* 383 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op, 384 - return whether we should terminate or not 385 == static bool p_ere_exp(struct parse *p); 386 */ 387 static bool 388 p_ere_exp(struct parse *p, struct branchc *bc) 389 { 390 char c; 391 wint_t wc; 392 sopno pos; 393 int count; 394 int count2; 395 #ifdef LIBREGEX 396 int i; 397 int handled; 398 #endif 399 sopno subno; 400 int wascaret = 0; 401 402 (void)bc; 403 assert(MORE()); /* caller should have ensured this */ 404 c = GETNEXT(); 405 406 #ifdef LIBREGEX 407 handled = 0; 408 #endif 409 pos = HERE(); 410 switch (c) { 411 case '(': 412 (void)REQUIRE(MORE(), REG_EPAREN); 413 p->g->nsub++; 414 subno = p->g->nsub; 415 if (subno < NPAREN) 416 p->pbegin[subno] = HERE(); 417 EMIT(OLPAREN, subno); 418 if (!SEE(')')) 419 p_re(p, ')', IGN); 420 if (subno < NPAREN) { 421 p->pend[subno] = HERE(); 422 assert(p->pend[subno] != 0); 423 } 424 EMIT(ORPAREN, subno); 425 (void)MUSTEAT(')', REG_EPAREN); 426 break; 427 #ifndef POSIX_MISTAKE 428 case ')': /* happens only if no current unmatched ( */ 429 /* 430 * You may ask, why the ifndef? Because I didn't notice 431 * this until slightly too late for 1003.2, and none of the 432 * other 1003.2 regular-expression reviewers noticed it at 433 * all. So an unmatched ) is legal POSIX, at least until 434 * we can get it fixed. 435 */ 436 SETERROR(REG_EPAREN); 437 break; 438 #endif 439 case '^': 440 EMIT(OBOL, 0); 441 p->g->iflags |= USEBOL; 442 p->g->nbol++; 443 wascaret = 1; 444 break; 445 case '$': 446 EMIT(OEOL, 0); 447 p->g->iflags |= USEEOL; 448 p->g->neol++; 449 break; 450 case '|': 451 SETERROR(REG_EMPTY); 452 break; 453 case '*': 454 case '+': 455 case '?': 456 #ifndef NO_STRICT_REGEX 457 case '{': 458 #endif 459 SETERROR(REG_BADRPT); 460 break; 461 case '.': 462 if (p->g->cflags®_NEWLINE) 463 nonnewline(p); 464 else 465 EMIT(OANY, 0); 466 break; 467 case '[': 468 p_bracket(p); 469 break; 470 case '\\': 471 (void)REQUIRE(MORE(), REG_EESCAPE); 472 wc = WGETNEXT(); 473 #ifdef LIBREGEX 474 if (p->gnuext) { 475 handled = 1; 476 switch (wc) { 477 case '`': 478 EMIT(OBOS, 0); 479 break; 480 case '\'': 481 EMIT(OEOS, 0); 482 break; 483 case 'B': 484 EMIT(ONWBND, 0); 485 break; 486 case 'b': 487 EMIT(OWBND, 0); 488 break; 489 case 'W': 490 case 'w': 491 case 'S': 492 case 's': 493 p_b_pseudoclass(p, wc); 494 break; 495 case '1': 496 case '2': 497 case '3': 498 case '4': 499 case '5': 500 case '6': 501 case '7': 502 case '8': 503 case '9': 504 i = wc - '0'; 505 assert(i < NPAREN); 506 if (p->pend[i] != 0) { 507 assert(i <= p->g->nsub); 508 EMIT(OBACK_, i); 509 assert(p->pbegin[i] != 0); 510 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 511 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 512 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 513 EMIT(O_BACK, i); 514 } else 515 SETERROR(REG_ESUBREG); 516 p->g->backrefs = 1; 517 break; 518 default: 519 handled = 0; 520 } 521 /* Don't proceed to the POSIX bits if we've already handled it */ 522 if (handled) 523 break; 524 } 525 #endif 526 switch (wc) { 527 case '<': 528 EMIT(OBOW, 0); 529 break; 530 case '>': 531 EMIT(OEOW, 0); 532 break; 533 default: 534 if (may_escape(p, wc)) 535 ordinary(p, wc); 536 else 537 SETERROR(REG_EESCAPE); 538 break; 539 } 540 break; 541 #ifdef NO_STRICT_REGEX 542 case '{': /* okay as ordinary except if digit follows */ 543 (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 544 /* FALLTHROUGH */ 545 #endif 546 default: 547 if (p->error != 0) 548 return (false); 549 p->next--; 550 wc = WGETNEXT(); 551 ordinary(p, wc); 552 break; 553 } 554 555 if (!MORE()) 556 return (false); 557 c = PEEK(); 558 /* we call { a repetition if followed by a digit */ 559 if (!( c == '*' || c == '+' || c == '?' || 560 #ifdef NO_STRICT_REGEX 561 (c == '{' && MORE2() && isdigit((uch)PEEK2())) 562 #else 563 c == '{' 564 #endif 565 )) 566 return (false); /* no repetition, we're done */ 567 #ifndef NO_STRICT_REGEX 568 else if (c == '{') 569 (void)REQUIRE(MORE2() && \ 570 (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT); 571 #endif 572 NEXT(); 573 574 (void)REQUIRE(!wascaret, REG_BADRPT); 575 switch (c) { 576 case '*': /* implemented as +? */ 577 /* this case does not require the (y|) trick, noKLUDGE */ 578 INSERT(OPLUS_, pos); 579 ASTERN(O_PLUS, pos); 580 INSERT(OQUEST_, pos); 581 ASTERN(O_QUEST, pos); 582 break; 583 case '+': 584 INSERT(OPLUS_, pos); 585 ASTERN(O_PLUS, pos); 586 break; 587 case '?': 588 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 589 INSERT(OCH_, pos); /* offset slightly wrong */ 590 ASTERN(OOR1, pos); /* this one's right */ 591 AHEAD(pos); /* fix the OCH_ */ 592 EMIT(OOR2, 0); /* offset very wrong... */ 593 AHEAD(THERE()); /* ...so fix it */ 594 ASTERN(O_CH, THERETHERE()); 595 break; 596 case '{': 597 count = p_count(p); 598 if (EAT(',')) { 599 if (isdigit((uch)PEEK())) { 600 count2 = p_count(p); 601 (void)REQUIRE(count <= count2, REG_BADBR); 602 } else /* single number with comma */ 603 count2 = INFINITY; 604 } else /* just a single number */ 605 count2 = count; 606 repeat(p, pos, count, count2); 607 if (!EAT('}')) { /* error heuristics */ 608 while (MORE() && PEEK() != '}') 609 NEXT(); 610 (void)REQUIRE(MORE(), REG_EBRACE); 611 SETERROR(REG_BADBR); 612 } 613 break; 614 } 615 616 if (!MORE()) 617 return (false); 618 c = PEEK(); 619 if (!( c == '*' || c == '+' || c == '?' || 620 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 621 return (false); 622 SETERROR(REG_BADRPT); 623 return (false); 624 } 625 626 /* 627 - p_str - string (no metacharacters) "parser" 628 == static void p_str(struct parse *p); 629 */ 630 static void 631 p_str(struct parse *p) 632 { 633 (void)REQUIRE(MORE(), REG_EMPTY); 634 while (MORE()) 635 ordinary(p, WGETNEXT()); 636 } 637 638 /* 639 * Eat consecutive branch delimiters for the kind of expression that we are 640 * parsing, return the number of delimiters that we ate. 641 */ 642 static int 643 p_branch_eat_delim(struct parse *p, struct branchc *bc) 644 { 645 int nskip; 646 647 (void)bc; 648 nskip = 0; 649 while (EATSPEC('|')) 650 ++nskip; 651 return (nskip); 652 } 653 654 /* 655 * Insert necessary branch book-keeping operations. This emits a 656 * bogus 'next' offset, since we still have more to parse 657 */ 658 static void 659 p_branch_ins_offset(struct parse *p, struct branchc *bc) 660 { 661 662 if (bc->nbranch == 0) { 663 INSERT(OCH_, bc->start); /* offset is wrong */ 664 bc->fwd = bc->start; 665 bc->back = bc->start; 666 } 667 668 ASTERN(OOR1, bc->back); 669 bc->back = THERE(); 670 AHEAD(bc->fwd); /* fix previous offset */ 671 bc->fwd = HERE(); 672 EMIT(OOR2, 0); /* offset is very wrong */ 673 ++bc->nbranch; 674 } 675 676 /* 677 * Fix the offset of the tail branch, if we actually had any branches. 678 * This is to correct the bogus placeholder offset that we use. 679 */ 680 static void 681 p_branch_fix_tail(struct parse *p, struct branchc *bc) 682 { 683 684 /* Fix bogus offset at the tail if we actually have branches */ 685 if (bc->nbranch > 0) { 686 AHEAD(bc->fwd); 687 ASTERN(O_CH, bc->back); 688 } 689 } 690 691 /* 692 * Signal to the parser that an empty branch has been encountered; this will, 693 * in the future, be used to allow for more permissive behavior with empty 694 * branches. The return value should indicate whether parsing may continue 695 * or not. 696 */ 697 static bool 698 p_branch_empty(struct parse *p, struct branchc *bc) 699 { 700 701 (void)bc; 702 SETERROR(REG_EMPTY); 703 return (false); 704 } 705 706 /* 707 * Take care of any branching requirements. This includes inserting the 708 * appropriate branching instructions as well as eating all of the branch 709 * delimiters until we either run out of pattern or need to parse more pattern. 710 */ 711 static bool 712 p_branch_do(struct parse *p, struct branchc *bc) 713 { 714 int ate = 0; 715 716 ate = p_branch_eat_delim(p, bc); 717 if (ate == 0) 718 return (false); 719 else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc)) 720 /* 721 * Halt parsing only if we have an empty branch and p_branch_empty 722 * indicates that we must not continue. In the future, this will not 723 * necessarily be an error. 724 */ 725 return (false); 726 p_branch_ins_offset(p, bc); 727 728 return (true); 729 } 730 731 static void 732 p_bre_pre_parse(struct parse *p, struct branchc *bc) 733 { 734 735 (void) bc; 736 /* 737 * Does not move cleanly into expression parser because of 738 * ordinary interpration of * at the beginning position of 739 * an expression. 740 */ 741 if (EAT('^')) { 742 EMIT(OBOL, 0); 743 p->g->iflags |= USEBOL; 744 p->g->nbol++; 745 } 746 } 747 748 static void 749 p_bre_post_parse(struct parse *p, struct branchc *bc) 750 { 751 752 /* Expression is terminating due to EOL token */ 753 if (bc->terminate) { 754 DROP(1); 755 EMIT(OEOL, 0); 756 p->g->iflags |= USEEOL; 757 p->g->neol++; 758 } 759 } 760 761 /* 762 - p_re - Top level parser, concatenation and BRE anchoring 763 == static void p_re(struct parse *p, int end1, int end2); 764 * Giving end1 as OUT essentially eliminates the end1/end2 check. 765 * 766 * This implementation is a bit of a kludge, in that a trailing $ is first 767 * taken as an ordinary character and then revised to be an anchor. 768 * The amount of lookahead needed to avoid this kludge is excessive. 769 */ 770 static void 771 p_re(struct parse *p, 772 int end1, /* first terminating character */ 773 int end2) /* second terminating character; ignored for EREs */ 774 { 775 struct branchc bc; 776 777 bc.nbranch = 0; 778 if (end1 == OUT && end2 == OUT) 779 bc.outer = true; 780 else 781 bc.outer = false; 782 #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2)) 783 for (;;) { 784 bc.start = HERE(); 785 bc.nchain = 0; 786 bc.terminate = false; 787 if (p->pre_parse != NULL) 788 p->pre_parse(p, &bc); 789 while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) { 790 bc.terminate = p->parse_expr(p, &bc); 791 ++bc.nchain; 792 } 793 if (p->post_parse != NULL) 794 p->post_parse(p, &bc); 795 (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY); 796 #ifdef LIBREGEX 797 if (HERE() == bc.start && !p_branch_empty(p, &bc)) 798 break; 799 #endif 800 if (!p->allowbranch) 801 break; 802 /* 803 * p_branch_do's return value indicates whether we should 804 * continue parsing or not. This is both for correctness and 805 * a slight optimization, because it will check if we've 806 * encountered an empty branch or the end of the string 807 * immediately following a branch delimiter. 808 */ 809 if (!p_branch_do(p, &bc)) 810 break; 811 } 812 #undef SEE_END 813 if (p->allowbranch) 814 p_branch_fix_tail(p, &bc); 815 assert(!MORE() || SEE(end1)); 816 } 817 818 /* 819 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 820 == static bool p_simp_re(struct parse *p, struct branchc *bc); 821 */ 822 static bool /* was the simple RE an unbackslashed $? */ 823 p_simp_re(struct parse *p, struct branchc *bc) 824 { 825 int c; 826 int cc; /* convenient/control character */ 827 int count; 828 int count2; 829 sopno pos; 830 bool handled; 831 int i; 832 wint_t wc; 833 sopno subno; 834 # define BACKSL (1<<CHAR_BIT) 835 836 pos = HERE(); /* repetition op, if any, covers from here */ 837 handled = false; 838 839 assert(MORE()); /* caller should have ensured this */ 840 c = (uch)GETNEXT(); 841 if (c == '\\') { 842 (void)REQUIRE(MORE(), REG_EESCAPE); 843 cc = (uch)GETNEXT(); 844 c = BACKSL | cc; 845 #ifdef LIBREGEX 846 if (p->gnuext) { 847 handled = true; 848 switch (c) { 849 case BACKSL|'`': 850 EMIT(OBOS, 0); 851 break; 852 case BACKSL|'\'': 853 EMIT(OEOS, 0); 854 break; 855 case BACKSL|'B': 856 EMIT(ONWBND, 0); 857 break; 858 case BACKSL|'b': 859 EMIT(OWBND, 0); 860 break; 861 case BACKSL|'W': 862 case BACKSL|'w': 863 case BACKSL|'S': 864 case BACKSL|'s': 865 p_b_pseudoclass(p, cc); 866 break; 867 default: 868 handled = false; 869 } 870 } 871 #endif 872 } 873 if (!handled) { 874 switch (c) { 875 case '.': 876 if (p->g->cflags®_NEWLINE) 877 nonnewline(p); 878 else 879 EMIT(OANY, 0); 880 break; 881 case '[': 882 p_bracket(p); 883 break; 884 case BACKSL|'<': 885 EMIT(OBOW, 0); 886 break; 887 case BACKSL|'>': 888 EMIT(OEOW, 0); 889 break; 890 case BACKSL|'{': 891 SETERROR(REG_BADRPT); 892 break; 893 case BACKSL|'(': 894 p->g->nsub++; 895 subno = p->g->nsub; 896 if (subno < NPAREN) 897 p->pbegin[subno] = HERE(); 898 EMIT(OLPAREN, subno); 899 /* the MORE here is an error heuristic */ 900 if (MORE() && !SEETWO('\\', ')')) 901 p_re(p, '\\', ')'); 902 if (subno < NPAREN) { 903 p->pend[subno] = HERE(); 904 assert(p->pend[subno] != 0); 905 } 906 EMIT(ORPAREN, subno); 907 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 908 break; 909 case BACKSL|')': /* should not get here -- must be user */ 910 #ifdef NO_STRICT_REGEX 911 case BACKSL|'}': 912 #endif 913 SETERROR(REG_EPAREN); 914 break; 915 case BACKSL|'1': 916 case BACKSL|'2': 917 case BACKSL|'3': 918 case BACKSL|'4': 919 case BACKSL|'5': 920 case BACKSL|'6': 921 case BACKSL|'7': 922 case BACKSL|'8': 923 case BACKSL|'9': 924 i = (c&~BACKSL) - '0'; 925 assert(i < NPAREN); 926 if (p->pend[i] != 0) { 927 assert(i <= p->g->nsub); 928 EMIT(OBACK_, i); 929 assert(p->pbegin[i] != 0); 930 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 931 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 932 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 933 EMIT(O_BACK, i); 934 } else 935 SETERROR(REG_ESUBREG); 936 p->g->backrefs = 1; 937 break; 938 case '*': 939 /* 940 * Ordinary if used as the first character beyond BOL anchor of 941 * a (sub-)expression, counts as a bad repetition operator if it 942 * appears otherwise. 943 */ 944 (void)REQUIRE(bc->nchain == 0, REG_BADRPT); 945 /* FALLTHROUGH */ 946 default: 947 if (p->error != 0) 948 return (false); /* Definitely not $... */ 949 p->next--; 950 wc = WGETNEXT(); 951 if ((c & BACKSL) == 0 || may_escape(p, wc)) 952 ordinary(p, wc); 953 else 954 SETERROR(REG_EESCAPE); 955 break; 956 } 957 } 958 959 if (EAT('*')) { /* implemented as +? */ 960 /* this case does not require the (y|) trick, noKLUDGE */ 961 INSERT(OPLUS_, pos); 962 ASTERN(O_PLUS, pos); 963 INSERT(OQUEST_, pos); 964 ASTERN(O_QUEST, pos); 965 #ifdef LIBREGEX 966 } else if (p->gnuext && EATTWO('\\', '?')) { 967 INSERT(OQUEST_, pos); 968 ASTERN(O_QUEST, pos); 969 } else if (p->gnuext && EATTWO('\\', '+')) { 970 INSERT(OPLUS_, pos); 971 ASTERN(O_PLUS, pos); 972 #endif 973 } else if (EATTWO('\\', '{')) { 974 count = p_count(p); 975 if (EAT(',')) { 976 if (MORE() && isdigit((uch)PEEK())) { 977 count2 = p_count(p); 978 (void)REQUIRE(count <= count2, REG_BADBR); 979 } else /* single number with comma */ 980 count2 = INFINITY; 981 } else /* just a single number */ 982 count2 = count; 983 repeat(p, pos, count, count2); 984 if (!EATTWO('\\', '}')) { /* error heuristics */ 985 while (MORE() && !SEETWO('\\', '}')) 986 NEXT(); 987 (void)REQUIRE(MORE(), REG_EBRACE); 988 SETERROR(REG_BADBR); 989 } 990 } else if (c == '$') /* $ (but not \$) ends it */ 991 return (true); 992 993 return (false); 994 } 995 996 /* 997 - p_count - parse a repetition count 998 == static int p_count(struct parse *p); 999 */ 1000 static int /* the value */ 1001 p_count(struct parse *p) 1002 { 1003 int count = 0; 1004 int ndigits = 0; 1005 1006 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 1007 count = count*10 + ((uch)GETNEXT() - '0'); 1008 ndigits++; 1009 } 1010 1011 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 1012 return(count); 1013 } 1014 1015 /* 1016 - p_bracket - parse a bracketed character list 1017 == static void p_bracket(struct parse *p); 1018 */ 1019 static void 1020 p_bracket(struct parse *p) 1021 { 1022 cset *cs; 1023 wint_t ch; 1024 1025 /* Dept of Truly Sickening Special-Case Kludges */ 1026 if (p->end - p->next > 5) { 1027 if (strncmp(p->next, "[:<:]]", 6) == 0) { 1028 EMIT(OBOW, 0); 1029 NEXTn(6); 1030 return; 1031 } 1032 if (strncmp(p->next, "[:>:]]", 6) == 0) { 1033 EMIT(OEOW, 0); 1034 NEXTn(6); 1035 return; 1036 } 1037 } 1038 1039 if ((cs = allocset(p)) == NULL) 1040 return; 1041 1042 if (p->g->cflags®_ICASE) 1043 cs->icase = 1; 1044 if (EAT('^')) 1045 cs->invert = 1; 1046 if (EAT(']')) 1047 CHadd(p, cs, ']'); 1048 else if (EAT('-')) 1049 CHadd(p, cs, '-'); 1050 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 1051 p_b_term(p, cs); 1052 if (EAT('-')) 1053 CHadd(p, cs, '-'); 1054 (void)MUSTEAT(']', REG_EBRACK); 1055 1056 if (p->error != 0) /* don't mess things up further */ 1057 return; 1058 1059 if (cs->invert && p->g->cflags®_NEWLINE) 1060 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 1061 1062 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 1063 ordinary(p, ch); 1064 freeset(p, cs); 1065 } else 1066 EMIT(OANYOF, (int)(cs - p->g->sets)); 1067 } 1068 1069 static int 1070 p_range_cmp(wchar_t c1, wchar_t c2) 1071 { 1072 #ifndef LIBREGEX 1073 return __wcollate_range_cmp(c1, c2); 1074 #else 1075 /* Copied from libc/collate __wcollate_range_cmp */ 1076 wchar_t s1[2], s2[2]; 1077 1078 s1[0] = c1; 1079 s1[1] = L'\0'; 1080 s2[0] = c2; 1081 s2[1] = L'\0'; 1082 return (wcscoll(s1, s2)); 1083 #endif 1084 } 1085 1086 /* 1087 - p_b_term - parse one term of a bracketed character list 1088 == static void p_b_term(struct parse *p, cset *cs); 1089 */ 1090 static void 1091 p_b_term(struct parse *p, cset *cs) 1092 { 1093 char c; 1094 wint_t start, finish; 1095 wint_t i; 1096 #ifndef LIBREGEX 1097 struct xlocale_collate *table = 1098 (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE]; 1099 #endif 1100 /* classify what we've got */ 1101 switch ((MORE()) ? PEEK() : '\0') { 1102 case '[': 1103 c = (MORE2()) ? PEEK2() : '\0'; 1104 break; 1105 case '-': 1106 SETERROR(REG_ERANGE); 1107 return; /* NOTE RETURN */ 1108 default: 1109 c = '\0'; 1110 break; 1111 } 1112 1113 switch (c) { 1114 case ':': /* character class */ 1115 NEXT2(); 1116 (void)REQUIRE(MORE(), REG_EBRACK); 1117 c = PEEK(); 1118 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 1119 p_b_cclass(p, cs); 1120 (void)REQUIRE(MORE(), REG_EBRACK); 1121 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 1122 break; 1123 case '=': /* equivalence class */ 1124 NEXT2(); 1125 (void)REQUIRE(MORE(), REG_EBRACK); 1126 c = PEEK(); 1127 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 1128 p_b_eclass(p, cs); 1129 (void)REQUIRE(MORE(), REG_EBRACK); 1130 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 1131 break; 1132 default: /* symbol, ordinary character, or range */ 1133 start = p_b_symbol(p); 1134 if (SEE('-') && MORE2() && PEEK2() != ']') { 1135 /* range */ 1136 NEXT(); 1137 if (EAT('-')) 1138 finish = '-'; 1139 else 1140 finish = p_b_symbol(p); 1141 } else 1142 finish = start; 1143 if (start == finish) 1144 CHadd(p, cs, start); 1145 else { 1146 #ifndef LIBREGEX 1147 if (table->__collate_load_error || MB_CUR_MAX > 1) { 1148 #else 1149 if (MB_CUR_MAX > 1) { 1150 #endif 1151 (void)REQUIRE(start <= finish, REG_ERANGE); 1152 CHaddrange(p, cs, start, finish); 1153 } else { 1154 (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE); 1155 for (i = 0; i <= UCHAR_MAX; i++) { 1156 if (p_range_cmp(start, i) <= 0 && 1157 p_range_cmp(i, finish) <= 0 ) 1158 CHadd(p, cs, i); 1159 } 1160 } 1161 } 1162 break; 1163 } 1164 } 1165 1166 /* 1167 - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S) 1168 == static int p_b_pseudoclass(struct parse *p, char c) 1169 */ 1170 static int 1171 p_b_pseudoclass(struct parse *p, char c) { 1172 cset *cs; 1173 1174 if ((cs = allocset(p)) == NULL) 1175 return(0); 1176 1177 if (p->g->cflags®_ICASE) 1178 cs->icase = 1; 1179 1180 switch (c) { 1181 case 'W': 1182 cs->invert = 1; 1183 /* PASSTHROUGH */ 1184 case 'w': 1185 p_b_cclass_named(p, cs, "alnum"); 1186 CHadd(p, cs, '_'); 1187 break; 1188 case 'S': 1189 cs->invert = 1; 1190 /* PASSTHROUGH */ 1191 case 's': 1192 p_b_cclass_named(p, cs, "space"); 1193 break; 1194 default: 1195 return(0); 1196 } 1197 1198 EMIT(OANYOF, (int)(cs - p->g->sets)); 1199 return(1); 1200 } 1201 1202 /* 1203 - p_b_cclass - parse a character-class name and deal with it 1204 == static void p_b_cclass(struct parse *p, cset *cs); 1205 */ 1206 static void 1207 p_b_cclass(struct parse *p, cset *cs) 1208 { 1209 const char *sp = p->next; 1210 size_t len; 1211 char clname[16]; 1212 1213 while (MORE() && isalpha((uch)PEEK())) 1214 NEXT(); 1215 len = p->next - sp; 1216 if (len >= sizeof(clname) - 1) { 1217 SETERROR(REG_ECTYPE); 1218 return; 1219 } 1220 memcpy(clname, sp, len); 1221 clname[len] = '\0'; 1222 1223 p_b_cclass_named(p, cs, clname); 1224 } 1225 /* 1226 - p_b_cclass_named - deal with a named character class 1227 == static void p_b_cclass_named(struct parse *p, cset *cs, const char []); 1228 */ 1229 static void 1230 p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) { 1231 wctype_t wct; 1232 1233 if ((wct = wctype(clname)) == 0) { 1234 SETERROR(REG_ECTYPE); 1235 return; 1236 } 1237 CHaddtype(p, cs, wct); 1238 } 1239 1240 /* 1241 - p_b_eclass - parse an equivalence-class name and deal with it 1242 == static void p_b_eclass(struct parse *p, cset *cs); 1243 * 1244 * This implementation is incomplete. xxx 1245 */ 1246 static void 1247 p_b_eclass(struct parse *p, cset *cs) 1248 { 1249 wint_t c; 1250 1251 c = p_b_coll_elem(p, '='); 1252 CHadd(p, cs, c); 1253 } 1254 1255 /* 1256 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 1257 == static wint_t p_b_symbol(struct parse *p); 1258 */ 1259 static wint_t /* value of symbol */ 1260 p_b_symbol(struct parse *p) 1261 { 1262 wint_t value; 1263 1264 (void)REQUIRE(MORE(), REG_EBRACK); 1265 if (!EATTWO('[', '.')) 1266 return(WGETNEXT()); 1267 1268 /* collating symbol */ 1269 value = p_b_coll_elem(p, '.'); 1270 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 1271 return(value); 1272 } 1273 1274 /* 1275 - p_b_coll_elem - parse a collating-element name and look it up 1276 == static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 1277 */ 1278 static wint_t /* value of collating element */ 1279 p_b_coll_elem(struct parse *p, 1280 wint_t endc) /* name ended by endc,']' */ 1281 { 1282 const char *sp = p->next; 1283 struct cname *cp; 1284 mbstate_t mbs; 1285 wchar_t wc; 1286 size_t clen, len; 1287 1288 while (MORE() && !SEETWO(endc, ']')) 1289 NEXT(); 1290 if (!MORE()) { 1291 SETERROR(REG_EBRACK); 1292 return(0); 1293 } 1294 len = p->next - sp; 1295 for (cp = cnames; cp->name != NULL; cp++) 1296 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) 1297 return(cp->code); /* known name */ 1298 memset(&mbs, 0, sizeof(mbs)); 1299 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 1300 return (wc); /* single character */ 1301 else if (clen == (size_t)-1 || clen == (size_t)-2) 1302 SETERROR(REG_ILLSEQ); 1303 else 1304 SETERROR(REG_ECOLLATE); /* neither */ 1305 return(0); 1306 } 1307 1308 /* 1309 - may_escape - determine whether 'ch' is escape-able in the current context 1310 == static int may_escape(struct parse *p, const wint_t ch) 1311 */ 1312 static bool 1313 may_escape(struct parse *p, const wint_t ch) 1314 { 1315 1316 if ((p->pflags & PFLAG_LEGACY_ESC) != 0) 1317 return (true); 1318 if (iswalpha(ch) || ch == '\'' || ch == '`') 1319 return (false); 1320 return (true); 1321 #ifdef NOTYET 1322 /* 1323 * Build a whitelist of characters that may be escaped to produce an 1324 * ordinary in the current context. This assumes that these have not 1325 * been otherwise interpreted as a special character. Escaping an 1326 * ordinary character yields undefined results according to 1327 * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take 1328 * advantage of this and use escaped ordinary characters to provide 1329 * special meaning, e.g. \b, \B, \w, \W, \s, \S. 1330 */ 1331 switch(ch) { 1332 case '|': 1333 case '+': 1334 case '?': 1335 /* The above characters may not be escaped in BREs */ 1336 if (!(p->g->cflags®_EXTENDED)) 1337 return (false); 1338 /* Fallthrough */ 1339 case '(': 1340 case ')': 1341 case '{': 1342 case '}': 1343 case '.': 1344 case '[': 1345 case ']': 1346 case '\\': 1347 case '*': 1348 case '^': 1349 case '$': 1350 return (true); 1351 default: 1352 return (false); 1353 } 1354 #endif 1355 } 1356 1357 /* 1358 - othercase - return the case counterpart of an alphabetic 1359 == static wint_t othercase(wint_t ch); 1360 */ 1361 static wint_t /* if no counterpart, return ch */ 1362 othercase(wint_t ch) 1363 { 1364 assert(iswalpha(ch)); 1365 if (iswupper(ch)) 1366 return(towlower(ch)); 1367 else if (iswlower(ch)) 1368 return(towupper(ch)); 1369 else /* peculiar, but could happen */ 1370 return(ch); 1371 } 1372 1373 /* 1374 - bothcases - emit a dualcase version of a two-case character 1375 == static void bothcases(struct parse *p, wint_t ch); 1376 * 1377 * Boy, is this implementation ever a kludge... 1378 */ 1379 static void 1380 bothcases(struct parse *p, wint_t ch) 1381 { 1382 const char *oldnext = p->next; 1383 const char *oldend = p->end; 1384 char bracket[3 + MB_LEN_MAX]; 1385 size_t n; 1386 mbstate_t mbs; 1387 1388 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1389 p->next = bracket; 1390 memset(&mbs, 0, sizeof(mbs)); 1391 n = wcrtomb(bracket, ch, &mbs); 1392 assert(n != (size_t)-1); 1393 bracket[n] = ']'; 1394 bracket[n + 1] = '\0'; 1395 p->end = bracket+n+1; 1396 p_bracket(p); 1397 assert(p->next == p->end); 1398 p->next = oldnext; 1399 p->end = oldend; 1400 } 1401 1402 /* 1403 - ordinary - emit an ordinary character 1404 == static void ordinary(struct parse *p, wint_t ch); 1405 */ 1406 static void 1407 ordinary(struct parse *p, wint_t ch) 1408 { 1409 cset *cs; 1410 1411 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 1412 bothcases(p, ch); 1413 else if ((ch & OPDMASK) == ch) 1414 EMIT(OCHAR, ch); 1415 else { 1416 /* 1417 * Kludge: character is too big to fit into an OCHAR operand. 1418 * Emit a singleton set. 1419 */ 1420 if ((cs = allocset(p)) == NULL) 1421 return; 1422 CHadd(p, cs, ch); 1423 EMIT(OANYOF, (int)(cs - p->g->sets)); 1424 } 1425 } 1426 1427 /* 1428 - nonnewline - emit REG_NEWLINE version of OANY 1429 == static void nonnewline(struct parse *p); 1430 * 1431 * Boy, is this implementation ever a kludge... 1432 */ 1433 static void 1434 nonnewline(struct parse *p) 1435 { 1436 const char *oldnext = p->next; 1437 const char *oldend = p->end; 1438 char bracket[4]; 1439 1440 p->next = bracket; 1441 p->end = bracket+3; 1442 bracket[0] = '^'; 1443 bracket[1] = '\n'; 1444 bracket[2] = ']'; 1445 bracket[3] = '\0'; 1446 p_bracket(p); 1447 assert(p->next == bracket+3); 1448 p->next = oldnext; 1449 p->end = oldend; 1450 } 1451 1452 /* 1453 - repeat - generate code for a bounded repetition, recursively if needed 1454 == static void repeat(struct parse *p, sopno start, int from, int to); 1455 */ 1456 static void 1457 repeat(struct parse *p, 1458 sopno start, /* operand from here to end of strip */ 1459 int from, /* repeated from this number */ 1460 int to) /* to this number of times (maybe INFINITY) */ 1461 { 1462 sopno finish = HERE(); 1463 # define N 2 1464 # define INF 3 1465 # define REP(f, t) ((f)*8 + (t)) 1466 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1467 sopno copy; 1468 1469 if (p->error != 0) /* head off possible runaway recursion */ 1470 return; 1471 1472 assert(from <= to); 1473 1474 switch (REP(MAP(from), MAP(to))) { 1475 case REP(0, 0): /* must be user doing this */ 1476 DROP(finish-start); /* drop the operand */ 1477 break; 1478 case REP(0, 1): /* as x{1,1}? */ 1479 case REP(0, N): /* as x{1,n}? */ 1480 case REP(0, INF): /* as x{1,}? */ 1481 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1482 INSERT(OCH_, start); /* offset is wrong... */ 1483 repeat(p, start+1, 1, to); 1484 ASTERN(OOR1, start); 1485 AHEAD(start); /* ... fix it */ 1486 EMIT(OOR2, 0); 1487 AHEAD(THERE()); 1488 ASTERN(O_CH, THERETHERE()); 1489 break; 1490 case REP(1, 1): /* trivial case */ 1491 /* done */ 1492 break; 1493 case REP(1, N): /* as x?x{1,n-1} */ 1494 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1495 INSERT(OCH_, start); 1496 ASTERN(OOR1, start); 1497 AHEAD(start); 1498 EMIT(OOR2, 0); /* offset very wrong... */ 1499 AHEAD(THERE()); /* ...so fix it */ 1500 ASTERN(O_CH, THERETHERE()); 1501 copy = dupl(p, start+1, finish+1); 1502 assert(copy == finish+4); 1503 repeat(p, copy, 1, to-1); 1504 break; 1505 case REP(1, INF): /* as x+ */ 1506 INSERT(OPLUS_, start); 1507 ASTERN(O_PLUS, start); 1508 break; 1509 case REP(N, N): /* as xx{m-1,n-1} */ 1510 copy = dupl(p, start, finish); 1511 repeat(p, copy, from-1, to-1); 1512 break; 1513 case REP(N, INF): /* as xx{n-1,INF} */ 1514 copy = dupl(p, start, finish); 1515 repeat(p, copy, from-1, to); 1516 break; 1517 default: /* "can't happen" */ 1518 SETERROR(REG_ASSERT); /* just in case */ 1519 break; 1520 } 1521 } 1522 1523 /* 1524 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1525 - character from the parse struct, signals a REG_ILLSEQ error if the 1526 - character can't be converted. Returns the number of bytes consumed. 1527 */ 1528 static wint_t 1529 wgetnext(struct parse *p) 1530 { 1531 mbstate_t mbs; 1532 wchar_t wc; 1533 size_t n; 1534 1535 memset(&mbs, 0, sizeof(mbs)); 1536 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); 1537 if (n == (size_t)-1 || n == (size_t)-2) { 1538 SETERROR(REG_ILLSEQ); 1539 return (0); 1540 } 1541 if (n == 0) 1542 n = 1; 1543 p->next += n; 1544 return (wc); 1545 } 1546 1547 /* 1548 - seterr - set an error condition 1549 == static int seterr(struct parse *p, int e); 1550 */ 1551 static int /* useless but makes type checking happy */ 1552 seterr(struct parse *p, int e) 1553 { 1554 if (p->error == 0) /* keep earliest error condition */ 1555 p->error = e; 1556 p->next = nuls; /* try to bring things to a halt */ 1557 p->end = nuls; 1558 return(0); /* make the return value well-defined */ 1559 } 1560 1561 /* 1562 - allocset - allocate a set of characters for [] 1563 == static cset *allocset(struct parse *p); 1564 */ 1565 static cset * 1566 allocset(struct parse *p) 1567 { 1568 cset *cs, *ncs; 1569 1570 ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs)); 1571 if (ncs == NULL) { 1572 SETERROR(REG_ESPACE); 1573 return (NULL); 1574 } 1575 p->g->sets = ncs; 1576 cs = &p->g->sets[p->g->ncsets++]; 1577 memset(cs, 0, sizeof(*cs)); 1578 1579 return(cs); 1580 } 1581 1582 /* 1583 - freeset - free a now-unused set 1584 == static void freeset(struct parse *p, cset *cs); 1585 */ 1586 static void 1587 freeset(struct parse *p, cset *cs) 1588 { 1589 cset *top = &p->g->sets[p->g->ncsets]; 1590 1591 free(cs->wides); 1592 free(cs->ranges); 1593 free(cs->types); 1594 memset(cs, 0, sizeof(*cs)); 1595 if (cs == top-1) /* recover only the easy case */ 1596 p->g->ncsets--; 1597 } 1598 1599 /* 1600 - singleton - Determine whether a set contains only one character, 1601 - returning it if so, otherwise returning OUT. 1602 */ 1603 static wint_t 1604 singleton(cset *cs) 1605 { 1606 wint_t i, s, n; 1607 1608 /* Exclude the complicated cases we don't want to deal with */ 1609 if (cs->nranges != 0 || cs->ntypes != 0 || cs->icase != 0) 1610 return (OUT); 1611 1612 if (cs->nwides > 1) 1613 return (OUT); 1614 1615 /* Count the number of characters present in the bitmap */ 1616 for (i = n = 0; i < NC; i++) 1617 if (CHIN(cs, i)) { 1618 n++; 1619 s = i; 1620 } 1621 1622 if (n > 1) 1623 return (OUT); 1624 1625 if (n == 1) { 1626 if (cs->nwides == 0) 1627 return (s); 1628 else 1629 return (OUT); 1630 } 1631 if (cs->nwides == 1) 1632 return (cs->wides[0]); 1633 1634 return (OUT); 1635 } 1636 1637 /* 1638 - CHadd - add character to character set. 1639 */ 1640 static void 1641 CHadd(struct parse *p, cset *cs, wint_t ch) 1642 { 1643 wint_t nch, *newwides; 1644 assert(ch >= 0); 1645 if (ch < NC) 1646 cs->bmp[ch >> 3] |= 1 << (ch & 7); 1647 else { 1648 newwides = reallocarray(cs->wides, cs->nwides + 1, 1649 sizeof(*cs->wides)); 1650 if (newwides == NULL) { 1651 SETERROR(REG_ESPACE); 1652 return; 1653 } 1654 cs->wides = newwides; 1655 cs->wides[cs->nwides++] = ch; 1656 } 1657 if (cs->icase) { 1658 if ((nch = towlower(ch)) < NC) 1659 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1660 if ((nch = towupper(ch)) < NC) 1661 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1662 } 1663 } 1664 1665 /* 1666 - CHaddrange - add all characters in the range [min,max] to a character set. 1667 */ 1668 static void 1669 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 1670 { 1671 crange *newranges; 1672 1673 for (; min < NC && min <= max; min++) 1674 CHadd(p, cs, min); 1675 if (min >= max) 1676 return; 1677 newranges = reallocarray(cs->ranges, cs->nranges + 1, 1678 sizeof(*cs->ranges)); 1679 if (newranges == NULL) { 1680 SETERROR(REG_ESPACE); 1681 return; 1682 } 1683 cs->ranges = newranges; 1684 cs->ranges[cs->nranges].min = min; 1685 cs->ranges[cs->nranges].max = max; 1686 cs->nranges++; 1687 } 1688 1689 /* 1690 - CHaddtype - add all characters of a certain type to a character set. 1691 */ 1692 static void 1693 CHaddtype(struct parse *p, cset *cs, wctype_t wct) 1694 { 1695 wint_t i; 1696 wctype_t *newtypes; 1697 1698 for (i = 0; i < NC; i++) 1699 if (iswctype(i, wct)) 1700 CHadd(p, cs, i); 1701 newtypes = reallocarray(cs->types, cs->ntypes + 1, 1702 sizeof(*cs->types)); 1703 if (newtypes == NULL) { 1704 SETERROR(REG_ESPACE); 1705 return; 1706 } 1707 cs->types = newtypes; 1708 cs->types[cs->ntypes++] = wct; 1709 } 1710 1711 /* 1712 - dupl - emit a duplicate of a bunch of sops 1713 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1714 */ 1715 static sopno /* start of duplicate */ 1716 dupl(struct parse *p, 1717 sopno start, /* from here */ 1718 sopno finish) /* to this less one */ 1719 { 1720 sopno ret = HERE(); 1721 sopno len = finish - start; 1722 1723 assert(finish >= start); 1724 if (len == 0) 1725 return(ret); 1726 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1727 return(ret); 1728 (void) memcpy((char *)(p->strip + p->slen), 1729 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1730 p->slen += len; 1731 return(ret); 1732 } 1733 1734 /* 1735 - doemit - emit a strip operator 1736 == static void doemit(struct parse *p, sop op, size_t opnd); 1737 * 1738 * It might seem better to implement this as a macro with a function as 1739 * hard-case backup, but it's just too big and messy unless there are 1740 * some changes to the data structures. Maybe later. 1741 */ 1742 static void 1743 doemit(struct parse *p, sop op, size_t opnd) 1744 { 1745 /* avoid making error situations worse */ 1746 if (p->error != 0) 1747 return; 1748 1749 /* deal with oversize operands ("can't happen", more or less) */ 1750 assert(opnd < 1<<OPSHIFT); 1751 1752 /* deal with undersized strip */ 1753 if (p->slen >= p->ssize) 1754 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1755 return; 1756 1757 /* finally, it's all reduced to the easy case */ 1758 p->strip[p->slen++] = SOP(op, opnd); 1759 } 1760 1761 /* 1762 - doinsert - insert a sop into the strip 1763 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1764 */ 1765 static void 1766 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1767 { 1768 sopno sn; 1769 sop s; 1770 int i; 1771 1772 /* avoid making error situations worse */ 1773 if (p->error != 0) 1774 return; 1775 1776 sn = HERE(); 1777 EMIT(op, opnd); /* do checks, ensure space */ 1778 assert(HERE() == sn+1); 1779 s = p->strip[sn]; 1780 1781 /* adjust paren pointers */ 1782 assert(pos > 0); 1783 for (i = 1; i < NPAREN; i++) { 1784 if (p->pbegin[i] >= pos) { 1785 p->pbegin[i]++; 1786 } 1787 if (p->pend[i] >= pos) { 1788 p->pend[i]++; 1789 } 1790 } 1791 1792 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1793 (HERE()-pos-1)*sizeof(sop)); 1794 p->strip[pos] = s; 1795 } 1796 1797 /* 1798 - dofwd - complete a forward reference 1799 == static void dofwd(struct parse *p, sopno pos, sop value); 1800 */ 1801 static void 1802 dofwd(struct parse *p, sopno pos, sop value) 1803 { 1804 /* avoid making error situations worse */ 1805 if (p->error != 0) 1806 return; 1807 1808 assert(value < 1<<OPSHIFT); 1809 p->strip[pos] = OP(p->strip[pos]) | value; 1810 } 1811 1812 /* 1813 - enlarge - enlarge the strip 1814 == static int enlarge(struct parse *p, sopno size); 1815 */ 1816 static int 1817 enlarge(struct parse *p, sopno size) 1818 { 1819 sop *sp; 1820 1821 if (p->ssize >= size) 1822 return 1; 1823 1824 sp = reallocarray(p->strip, size, sizeof(sop)); 1825 if (sp == NULL) { 1826 SETERROR(REG_ESPACE); 1827 return 0; 1828 } 1829 p->strip = sp; 1830 p->ssize = size; 1831 return 1; 1832 } 1833 1834 /* 1835 - stripsnug - compact the strip 1836 == static void stripsnug(struct parse *p, struct re_guts *g); 1837 */ 1838 static void 1839 stripsnug(struct parse *p, struct re_guts *g) 1840 { 1841 g->nstates = p->slen; 1842 g->strip = reallocarray((char *)p->strip, p->slen, sizeof(sop)); 1843 if (g->strip == NULL) { 1844 SETERROR(REG_ESPACE); 1845 g->strip = p->strip; 1846 } 1847 } 1848 1849 /* 1850 - findmust - fill in must and mlen with longest mandatory literal string 1851 == static void findmust(struct parse *p, struct re_guts *g); 1852 * 1853 * This algorithm could do fancy things like analyzing the operands of | 1854 * for common subsequences. Someday. This code is simple and finds most 1855 * of the interesting cases. 1856 * 1857 * Note that must and mlen got initialized during setup. 1858 */ 1859 static void 1860 findmust(struct parse *p, struct re_guts *g) 1861 { 1862 sop *scan; 1863 sop *start = NULL; 1864 sop *newstart = NULL; 1865 sopno newlen; 1866 sop s; 1867 char *cp; 1868 int offset; 1869 char buf[MB_LEN_MAX]; 1870 size_t clen; 1871 mbstate_t mbs; 1872 1873 /* avoid making error situations worse */ 1874 if (p->error != 0) 1875 return; 1876 1877 /* 1878 * It's not generally safe to do a ``char'' substring search on 1879 * multibyte character strings, but it's safe for at least 1880 * UTF-8 (see RFC 3629). 1881 */ 1882 if (MB_CUR_MAX > 1 && 1883 strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) 1884 return; 1885 1886 /* find the longest OCHAR sequence in strip */ 1887 newlen = 0; 1888 offset = 0; 1889 g->moffset = 0; 1890 scan = g->strip + 1; 1891 do { 1892 s = *scan++; 1893 switch (OP(s)) { 1894 case OCHAR: /* sequence member */ 1895 if (newlen == 0) { /* new sequence */ 1896 memset(&mbs, 0, sizeof(mbs)); 1897 newstart = scan - 1; 1898 } 1899 clen = wcrtomb(buf, OPND(s), &mbs); 1900 if (clen == (size_t)-1) 1901 goto toohard; 1902 newlen += clen; 1903 break; 1904 case OPLUS_: /* things that don't break one */ 1905 case OLPAREN: 1906 case ORPAREN: 1907 break; 1908 case OQUEST_: /* things that must be skipped */ 1909 case OCH_: 1910 offset = altoffset(scan, offset); 1911 scan--; 1912 do { 1913 scan += OPND(s); 1914 s = *scan; 1915 /* assert() interferes w debug printouts */ 1916 if (OP(s) != (sop)O_QUEST && 1917 OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) { 1918 g->iflags |= BAD; 1919 return; 1920 } 1921 } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH); 1922 /* FALLTHROUGH */ 1923 case OBOW: /* things that break a sequence */ 1924 case OEOW: 1925 case OBOL: 1926 case OEOL: 1927 case OBOS: 1928 case OEOS: 1929 case OWBND: 1930 case ONWBND: 1931 case O_QUEST: 1932 case O_CH: 1933 case OEND: 1934 if (newlen > (sopno)g->mlen) { /* ends one */ 1935 start = newstart; 1936 g->mlen = newlen; 1937 if (offset > -1) { 1938 g->moffset += offset; 1939 offset = newlen; 1940 } else 1941 g->moffset = offset; 1942 } else { 1943 if (offset > -1) 1944 offset += newlen; 1945 } 1946 newlen = 0; 1947 break; 1948 case OANY: 1949 if (newlen > (sopno)g->mlen) { /* ends one */ 1950 start = newstart; 1951 g->mlen = newlen; 1952 if (offset > -1) { 1953 g->moffset += offset; 1954 offset = newlen; 1955 } else 1956 g->moffset = offset; 1957 } else { 1958 if (offset > -1) 1959 offset += newlen; 1960 } 1961 if (offset > -1) 1962 offset++; 1963 newlen = 0; 1964 break; 1965 case OANYOF: /* may or may not invalidate offset */ 1966 /* First, everything as OANY */ 1967 if (newlen > (sopno)g->mlen) { /* ends one */ 1968 start = newstart; 1969 g->mlen = newlen; 1970 if (offset > -1) { 1971 g->moffset += offset; 1972 offset = newlen; 1973 } else 1974 g->moffset = offset; 1975 } else { 1976 if (offset > -1) 1977 offset += newlen; 1978 } 1979 if (offset > -1) 1980 offset++; 1981 newlen = 0; 1982 break; 1983 toohard: 1984 default: 1985 /* Anything here makes it impossible or too hard 1986 * to calculate the offset -- so we give up; 1987 * save the last known good offset, in case the 1988 * must sequence doesn't occur later. 1989 */ 1990 if (newlen > (sopno)g->mlen) { /* ends one */ 1991 start = newstart; 1992 g->mlen = newlen; 1993 if (offset > -1) 1994 g->moffset += offset; 1995 else 1996 g->moffset = offset; 1997 } 1998 offset = -1; 1999 newlen = 0; 2000 break; 2001 } 2002 } while (OP(s) != OEND); 2003 2004 if (g->mlen == 0) { /* there isn't one */ 2005 g->moffset = -1; 2006 return; 2007 } 2008 2009 /* turn it into a character string */ 2010 g->must = malloc((size_t)g->mlen + 1); 2011 if (g->must == NULL) { /* argh; just forget it */ 2012 g->mlen = 0; 2013 g->moffset = -1; 2014 return; 2015 } 2016 cp = g->must; 2017 scan = start; 2018 memset(&mbs, 0, sizeof(mbs)); 2019 while (cp < g->must + g->mlen) { 2020 while (OP(s = *scan++) != OCHAR) 2021 continue; 2022 clen = wcrtomb(cp, OPND(s), &mbs); 2023 assert(clen != (size_t)-1); 2024 cp += clen; 2025 } 2026 assert(cp == g->must + g->mlen); 2027 *cp++ = '\0'; /* just on general principles */ 2028 } 2029 2030 /* 2031 - altoffset - choose biggest offset among multiple choices 2032 == static int altoffset(sop *scan, int offset); 2033 * 2034 * Compute, recursively if necessary, the largest offset among multiple 2035 * re paths. 2036 */ 2037 static int 2038 altoffset(sop *scan, int offset) 2039 { 2040 int largest; 2041 int try; 2042 sop s; 2043 2044 /* If we gave up already on offsets, return */ 2045 if (offset == -1) 2046 return -1; 2047 2048 largest = 0; 2049 try = 0; 2050 s = *scan++; 2051 while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) { 2052 switch (OP(s)) { 2053 case OOR1: 2054 if (try > largest) 2055 largest = try; 2056 try = 0; 2057 break; 2058 case OQUEST_: 2059 case OCH_: 2060 try = altoffset(scan, try); 2061 if (try == -1) 2062 return -1; 2063 scan--; 2064 do { 2065 scan += OPND(s); 2066 s = *scan; 2067 if (OP(s) != (sop)O_QUEST && 2068 OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) 2069 return -1; 2070 } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH); 2071 /* We must skip to the next position, or we'll 2072 * leave altoffset() too early. 2073 */ 2074 scan++; 2075 break; 2076 case OANYOF: 2077 case OCHAR: 2078 case OANY: 2079 try++; 2080 case OBOW: 2081 case OEOW: 2082 case OWBND: 2083 case ONWBND: 2084 case OLPAREN: 2085 case ORPAREN: 2086 case OOR2: 2087 break; 2088 default: 2089 try = -1; 2090 break; 2091 } 2092 if (try == -1) 2093 return -1; 2094 s = *scan++; 2095 } 2096 2097 if (try > largest) 2098 largest = try; 2099 2100 return largest+offset; 2101 } 2102 2103 /* 2104 - computejumps - compute char jumps for BM scan 2105 == static void computejumps(struct parse *p, struct re_guts *g); 2106 * 2107 * This algorithm assumes g->must exists and is has size greater than 2108 * zero. It's based on the algorithm found on Computer Algorithms by 2109 * Sara Baase. 2110 * 2111 * A char jump is the number of characters one needs to jump based on 2112 * the value of the character from the text that was mismatched. 2113 */ 2114 static void 2115 computejumps(struct parse *p, struct re_guts *g) 2116 { 2117 int ch; 2118 int mindex; 2119 2120 /* Avoid making errors worse */ 2121 if (p->error != 0) 2122 return; 2123 2124 g->charjump = (int *)malloc((NC_MAX + 1) * sizeof(int)); 2125 if (g->charjump == NULL) /* Not a fatal error */ 2126 return; 2127 /* Adjust for signed chars, if necessary */ 2128 g->charjump = &g->charjump[-(CHAR_MIN)]; 2129 2130 /* If the character does not exist in the pattern, the jump 2131 * is equal to the number of characters in the pattern. 2132 */ 2133 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 2134 g->charjump[ch] = g->mlen; 2135 2136 /* If the character does exist, compute the jump that would 2137 * take us to the last character in the pattern equal to it 2138 * (notice that we match right to left, so that last character 2139 * is the first one that would be matched). 2140 */ 2141 for (mindex = 0; mindex < g->mlen; mindex++) 2142 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 2143 } 2144 2145 /* 2146 - computematchjumps - compute match jumps for BM scan 2147 == static void computematchjumps(struct parse *p, struct re_guts *g); 2148 * 2149 * This algorithm assumes g->must exists and is has size greater than 2150 * zero. It's based on the algorithm found on Computer Algorithms by 2151 * Sara Baase. 2152 * 2153 * A match jump is the number of characters one needs to advance based 2154 * on the already-matched suffix. 2155 * Notice that all values here are minus (g->mlen-1), because of the way 2156 * the search algorithm works. 2157 */ 2158 static void 2159 computematchjumps(struct parse *p, struct re_guts *g) 2160 { 2161 int mindex; /* General "must" iterator */ 2162 int suffix; /* Keeps track of matching suffix */ 2163 int ssuffix; /* Keeps track of suffixes' suffix */ 2164 int* pmatches; /* pmatches[k] points to the next i 2165 * such that i+1...mlen is a substring 2166 * of k+1...k+mlen-i-1 2167 */ 2168 2169 /* Avoid making errors worse */ 2170 if (p->error != 0) 2171 return; 2172 2173 pmatches = (int*) malloc(g->mlen * sizeof(int)); 2174 if (pmatches == NULL) { 2175 g->matchjump = NULL; 2176 return; 2177 } 2178 2179 g->matchjump = (int*) malloc(g->mlen * sizeof(int)); 2180 if (g->matchjump == NULL) { /* Not a fatal error */ 2181 free(pmatches); 2182 return; 2183 } 2184 2185 /* Set maximum possible jump for each character in the pattern */ 2186 for (mindex = 0; mindex < g->mlen; mindex++) 2187 g->matchjump[mindex] = 2*g->mlen - mindex - 1; 2188 2189 /* Compute pmatches[] */ 2190 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 2191 mindex--, suffix--) { 2192 pmatches[mindex] = suffix; 2193 2194 /* If a mismatch is found, interrupting the substring, 2195 * compute the matchjump for that position. If no 2196 * mismatch is found, then a text substring mismatched 2197 * against the suffix will also mismatch against the 2198 * substring. 2199 */ 2200 while (suffix < g->mlen 2201 && g->must[mindex] != g->must[suffix]) { 2202 g->matchjump[suffix] = MIN(g->matchjump[suffix], 2203 g->mlen - mindex - 1); 2204 suffix = pmatches[suffix]; 2205 } 2206 } 2207 2208 /* Compute the matchjump up to the last substring found to jump 2209 * to the beginning of the largest must pattern prefix matching 2210 * it's own suffix. 2211 */ 2212 for (mindex = 0; mindex <= suffix; mindex++) 2213 g->matchjump[mindex] = MIN(g->matchjump[mindex], 2214 g->mlen + suffix - mindex); 2215 2216 ssuffix = pmatches[suffix]; 2217 while (suffix < g->mlen) { 2218 while (suffix <= ssuffix && suffix < g->mlen) { 2219 g->matchjump[suffix] = MIN(g->matchjump[suffix], 2220 g->mlen + ssuffix - suffix); 2221 suffix++; 2222 } 2223 if (suffix < g->mlen) 2224 ssuffix = pmatches[ssuffix]; 2225 } 2226 2227 free(pmatches); 2228 } 2229 2230 /* 2231 - pluscount - count + nesting 2232 == static sopno pluscount(struct parse *p, struct re_guts *g); 2233 */ 2234 static sopno /* nesting depth */ 2235 pluscount(struct parse *p, struct re_guts *g) 2236 { 2237 sop *scan; 2238 sop s; 2239 sopno plusnest = 0; 2240 sopno maxnest = 0; 2241 2242 if (p->error != 0) 2243 return(0); /* there may not be an OEND */ 2244 2245 scan = g->strip + 1; 2246 do { 2247 s = *scan++; 2248 switch (OP(s)) { 2249 case OPLUS_: 2250 plusnest++; 2251 break; 2252 case O_PLUS: 2253 if (plusnest > maxnest) 2254 maxnest = plusnest; 2255 plusnest--; 2256 break; 2257 } 2258 } while (OP(s) != OEND); 2259 if (plusnest != 0) 2260 g->iflags |= BAD; 2261 return(maxnest); 2262 } 2263