1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Copyright (c) 2011 The FreeBSD Foundation 9 * 10 * Portions of this software were developed by David Chisnall 11 * under sponsorship from the FreeBSD Foundation. 12 * 13 * This code is derived from software contributed to Berkeley by 14 * Henry Spencer. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 */ 40 41 #include <sys/types.h> 42 #include <stdio.h> 43 #include <string.h> 44 #include <ctype.h> 45 #include <limits.h> 46 #include <stdlib.h> 47 #include <regex.h> 48 #include <stdbool.h> 49 #include <wchar.h> 50 #include <wctype.h> 51 52 #ifndef LIBREGEX 53 #include "collate.h" 54 #endif 55 56 #include "utils.h" 57 #include "regex2.h" 58 59 #include "cname.h" 60 61 /* 62 * Branching context, used to keep track of branch state for all of the branch- 63 * aware functions. In addition to keeping track of branch positions for the 64 * p_branch_* functions, we use this to simplify some clumsiness in BREs for 65 * detection of whether ^ is acting as an anchor or being used erroneously and 66 * also for whether we're in a sub-expression or not. 67 */ 68 struct branchc { 69 sopno start; 70 sopno back; 71 sopno fwd; 72 73 int nbranch; 74 int nchain; 75 bool outer; 76 bool terminate; 77 }; 78 79 /* 80 * parse structure, passed up and down to avoid global variables and 81 * other clumsinesses 82 */ 83 struct parse { 84 const char *next; /* next character in RE */ 85 const char *end; /* end of string (-> NUL normally) */ 86 int error; /* has an error been seen? */ 87 int gnuext; 88 sop *strip; /* malloced strip */ 89 sopno ssize; /* malloced strip size (allocated) */ 90 sopno slen; /* malloced strip length (used) */ 91 int ncsalloc; /* number of csets allocated */ 92 struct re_guts *g; 93 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 94 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 95 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 96 bool allowbranch; /* can this expression branch? */ 97 bool bre; /* convenience; is this a BRE? */ 98 int pflags; /* other parsing flags -- legacy escapes? */ 99 bool (*parse_expr)(struct parse *, struct branchc *); 100 void (*pre_parse)(struct parse *, struct branchc *); 101 void (*post_parse)(struct parse *, struct branchc *); 102 }; 103 104 #define PFLAG_LEGACY_ESC 0x00000001 105 106 /* ========= begin header generated by ./mkh ========= */ 107 #ifdef __cplusplus 108 extern "C" { 109 #endif 110 111 /* === regcomp.c === */ 112 static bool p_ere_exp(struct parse *p, struct branchc *bc); 113 static void p_str(struct parse *p); 114 static int p_branch_eat_delim(struct parse *p, struct branchc *bc); 115 static void p_branch_ins_offset(struct parse *p, struct branchc *bc); 116 static void p_branch_fix_tail(struct parse *p, struct branchc *bc); 117 static bool p_branch_empty(struct parse *p, struct branchc *bc); 118 static bool p_branch_do(struct parse *p, struct branchc *bc); 119 static void p_bre_pre_parse(struct parse *p, struct branchc *bc); 120 static void p_bre_post_parse(struct parse *p, struct branchc *bc); 121 static void p_re(struct parse *p, int end1, int end2); 122 static bool p_simp_re(struct parse *p, struct branchc *bc); 123 static int p_count(struct parse *p); 124 static void p_bracket(struct parse *p); 125 static int p_range_cmp(wchar_t c1, wchar_t c2); 126 static void p_b_term(struct parse *p, cset *cs); 127 static int p_b_pseudoclass(struct parse *p, char c); 128 static void p_b_cclass(struct parse *p, cset *cs); 129 static void p_b_cclass_named(struct parse *p, cset *cs, const char[]); 130 static void p_b_eclass(struct parse *p, cset *cs); 131 static wint_t p_b_symbol(struct parse *p); 132 static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 133 static bool may_escape(struct parse *p, const wint_t ch); 134 static wint_t othercase(wint_t ch); 135 static void bothcases(struct parse *p, wint_t ch); 136 static void ordinary(struct parse *p, wint_t ch); 137 static void nonnewline(struct parse *p); 138 static void repeat(struct parse *p, sopno start, int from, int to); 139 static int seterr(struct parse *p, int e); 140 static cset *allocset(struct parse *p); 141 static void freeset(struct parse *p, cset *cs); 142 static void CHadd(struct parse *p, cset *cs, wint_t ch); 143 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 144 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 145 static wint_t singleton(cset *cs); 146 static sopno dupl(struct parse *p, sopno start, sopno finish); 147 static void doemit(struct parse *p, sop op, size_t opnd); 148 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 149 static void dofwd(struct parse *p, sopno pos, sop value); 150 static int enlarge(struct parse *p, sopno size); 151 static void stripsnug(struct parse *p, struct re_guts *g); 152 static void findmust(struct parse *p, struct re_guts *g); 153 static int altoffset(sop *scan, int offset); 154 static void computejumps(struct parse *p, struct re_guts *g); 155 static void computematchjumps(struct parse *p, struct re_guts *g); 156 static sopno pluscount(struct parse *p, struct re_guts *g); 157 static wint_t wgetnext(struct parse *p); 158 159 #ifdef __cplusplus 160 } 161 #endif 162 /* ========= end header generated by ./mkh ========= */ 163 164 static char nuls[10]; /* place to point scanner in event of error */ 165 166 /* 167 * macros for use with parse structure 168 * BEWARE: these know that the parse structure is named `p' !!! 169 */ 170 #define PEEK() (*p->next) 171 #define PEEK2() (*(p->next+1)) 172 #define MORE() (p->end - p->next > 0) 173 #define MORE2() (p->end - p->next > 1) 174 #define SEE(c) (MORE() && PEEK() == (c)) 175 #define SEETWO(a, b) (MORE2() && PEEK() == (a) && PEEK2() == (b)) 176 #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a)) 177 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 178 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 179 #define EATSPEC(a) (p->bre ? EATTWO('\\', a) : EAT(a)) 180 #define NEXT() (p->next++) 181 #define NEXT2() (p->next += 2) 182 #define NEXTn(n) (p->next += (n)) 183 #define GETNEXT() (*p->next++) 184 #define WGETNEXT() wgetnext(p) 185 #define SETERROR(e) seterr(p, (e)) 186 #define REQUIRE(co, e) ((co) || SETERROR(e)) 187 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 188 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 189 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 190 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 191 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 192 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 193 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 194 #define HERE() (p->slen) 195 #define THERE() (p->slen - 1) 196 #define THERETHERE() (p->slen - 2) 197 #define DROP(n) (p->slen -= (n)) 198 199 /* Macro used by computejump()/computematchjump() */ 200 #define MIN(a,b) ((a)<(b)?(a):(b)) 201 202 static int /* 0 success, otherwise REG_something */ 203 regcomp_internal(regex_t * __restrict preg, 204 const char * __restrict pattern, 205 int cflags, int pflags) 206 { 207 struct parse pa; 208 struct re_guts *g; 209 struct parse *p = &pa; 210 int i; 211 size_t len; 212 size_t maxlen; 213 #ifdef REDEBUG 214 # define GOODFLAGS(f) (f) 215 #else 216 # define GOODFLAGS(f) ((f)&~REG_DUMP) 217 #endif 218 219 cflags = GOODFLAGS(cflags); 220 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 221 return(REG_INVARG); 222 223 if (cflags®_PEND) { 224 if (preg->re_endp < pattern) 225 return(REG_INVARG); 226 len = preg->re_endp - pattern; 227 } else 228 len = strlen(pattern); 229 230 /* do the mallocs early so failure handling is easy */ 231 g = (struct re_guts *)malloc(sizeof(struct re_guts)); 232 if (g == NULL) 233 return(REG_ESPACE); 234 /* 235 * Limit the pattern space to avoid a 32-bit overflow on buffer 236 * extension. Also avoid any signed overflow in case of conversion 237 * so make the real limit based on a 31-bit overflow. 238 * 239 * Likely not applicable on 64-bit systems but handle the case 240 * generically (who are we to stop people from using ~715MB+ 241 * patterns?). 242 */ 243 maxlen = ((size_t)-1 >> 1) / sizeof(sop) * 2 / 3; 244 if (len >= maxlen) { 245 free((char *)g); 246 return(REG_ESPACE); 247 } 248 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 249 assert(p->ssize >= len); 250 251 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 252 p->slen = 0; 253 if (p->strip == NULL) { 254 free((char *)g); 255 return(REG_ESPACE); 256 } 257 258 /* set things up */ 259 p->g = g; 260 p->next = pattern; /* convenience; we do not modify it */ 261 p->end = p->next + len; 262 p->error = 0; 263 p->ncsalloc = 0; 264 p->pflags = pflags; 265 for (i = 0; i < NPAREN; i++) { 266 p->pbegin[i] = 0; 267 p->pend[i] = 0; 268 } 269 #ifdef LIBREGEX 270 if (cflags®_POSIX) { 271 p->gnuext = false; 272 p->allowbranch = (cflags & REG_EXTENDED) != 0; 273 } else 274 p->gnuext = p->allowbranch = true; 275 #else 276 p->gnuext = false; 277 p->allowbranch = (cflags & REG_EXTENDED) != 0; 278 #endif 279 if (cflags & REG_EXTENDED) { 280 p->bre = false; 281 p->parse_expr = p_ere_exp; 282 p->pre_parse = NULL; 283 p->post_parse = NULL; 284 } else { 285 p->bre = true; 286 p->parse_expr = p_simp_re; 287 p->pre_parse = p_bre_pre_parse; 288 p->post_parse = p_bre_post_parse; 289 } 290 g->sets = NULL; 291 g->ncsets = 0; 292 g->cflags = cflags; 293 g->iflags = 0; 294 g->nbol = 0; 295 g->neol = 0; 296 g->must = NULL; 297 g->moffset = -1; 298 g->charjump = NULL; 299 g->matchjump = NULL; 300 g->mlen = 0; 301 g->nsub = 0; 302 g->backrefs = 0; 303 304 /* do it */ 305 EMIT(OEND, 0); 306 g->firststate = THERE(); 307 if (cflags & REG_NOSPEC) 308 p_str(p); 309 else 310 p_re(p, OUT, OUT); 311 EMIT(OEND, 0); 312 g->laststate = THERE(); 313 314 /* tidy up loose ends and fill things in */ 315 stripsnug(p, g); 316 findmust(p, g); 317 /* only use Boyer-Moore algorithm if the pattern is bigger 318 * than three characters 319 */ 320 if(g->mlen > 3) { 321 computejumps(p, g); 322 computematchjumps(p, g); 323 if(g->matchjump == NULL && g->charjump != NULL) { 324 free(&g->charjump[CHAR_MIN]); 325 g->charjump = NULL; 326 } 327 } 328 g->nplus = pluscount(p, g); 329 g->magic = MAGIC2; 330 preg->re_nsub = g->nsub; 331 preg->re_g = g; 332 preg->re_magic = MAGIC1; 333 #ifndef REDEBUG 334 /* not debugging, so can't rely on the assert() in regexec() */ 335 if (g->iflags&BAD) 336 SETERROR(REG_ASSERT); 337 #endif 338 339 /* win or lose, we're done */ 340 if (p->error != 0) /* lose */ 341 regfree(preg); 342 return(p->error); 343 } 344 345 /* 346 - regcomp - interface for parser and compilation 347 = extern int regcomp(regex_t *, const char *, int); 348 = #define REG_BASIC 0000 349 = #define REG_EXTENDED 0001 350 = #define REG_ICASE 0002 351 = #define REG_NOSUB 0004 352 = #define REG_NEWLINE 0010 353 = #define REG_NOSPEC 0020 354 = #define REG_PEND 0040 355 = #define REG_DUMP 0200 356 */ 357 int /* 0 success, otherwise REG_something */ 358 regcomp(regex_t * __restrict preg, 359 const char * __restrict pattern, 360 int cflags) 361 { 362 363 return (regcomp_internal(preg, pattern, cflags, 0)); 364 } 365 366 #ifndef LIBREGEX 367 /* 368 * Legacy interface that requires more lax escaping behavior. 369 */ 370 int 371 freebsd12_regcomp(regex_t * __restrict preg, 372 const char * __restrict pattern, 373 int cflags, int pflags) 374 { 375 376 return (regcomp_internal(preg, pattern, cflags, PFLAG_LEGACY_ESC)); 377 } 378 379 __sym_compat(regcomp, freebsd12_regcomp, FBSD_1.0); 380 #endif /* !LIBREGEX */ 381 382 /* 383 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op, 384 - return whether we should terminate or not 385 == static bool p_ere_exp(struct parse *p); 386 */ 387 static bool 388 p_ere_exp(struct parse *p, struct branchc *bc) 389 { 390 char c; 391 wint_t wc; 392 sopno pos; 393 int count; 394 int count2; 395 #ifdef LIBREGEX 396 int i; 397 int handled; 398 #endif 399 sopno subno; 400 int wascaret = 0; 401 402 (void)bc; 403 assert(MORE()); /* caller should have ensured this */ 404 c = GETNEXT(); 405 406 #ifdef LIBREGEX 407 handled = 0; 408 #endif 409 pos = HERE(); 410 switch (c) { 411 case '(': 412 (void)REQUIRE(MORE(), REG_EPAREN); 413 p->g->nsub++; 414 subno = p->g->nsub; 415 if (subno < NPAREN) 416 p->pbegin[subno] = HERE(); 417 EMIT(OLPAREN, subno); 418 if (!SEE(')')) 419 p_re(p, ')', IGN); 420 if (subno < NPAREN) { 421 p->pend[subno] = HERE(); 422 assert(p->pend[subno] != 0); 423 } 424 EMIT(ORPAREN, subno); 425 (void)MUSTEAT(')', REG_EPAREN); 426 break; 427 #ifndef POSIX_MISTAKE 428 case ')': /* happens only if no current unmatched ( */ 429 /* 430 * You may ask, why the ifndef? Because I didn't notice 431 * this until slightly too late for 1003.2, and none of the 432 * other 1003.2 regular-expression reviewers noticed it at 433 * all. So an unmatched ) is legal POSIX, at least until 434 * we can get it fixed. 435 */ 436 SETERROR(REG_EPAREN); 437 break; 438 #endif 439 case '^': 440 EMIT(OBOL, 0); 441 p->g->iflags |= USEBOL; 442 p->g->nbol++; 443 wascaret = 1; 444 break; 445 case '$': 446 EMIT(OEOL, 0); 447 p->g->iflags |= USEEOL; 448 p->g->neol++; 449 break; 450 case '|': 451 SETERROR(REG_EMPTY); 452 break; 453 case '*': 454 case '+': 455 case '?': 456 #ifndef NO_STRICT_REGEX 457 case '{': 458 #endif 459 SETERROR(REG_BADRPT); 460 break; 461 case '.': 462 if (p->g->cflags®_NEWLINE) 463 nonnewline(p); 464 else 465 EMIT(OANY, 0); 466 break; 467 case '[': 468 p_bracket(p); 469 break; 470 case '\\': 471 (void)REQUIRE(MORE(), REG_EESCAPE); 472 wc = WGETNEXT(); 473 #ifdef LIBREGEX 474 if (p->gnuext) { 475 handled = 1; 476 switch (wc) { 477 case '`': 478 EMIT(OBOS, 0); 479 break; 480 case '\'': 481 EMIT(OEOS, 0); 482 break; 483 case 'B': 484 EMIT(ONWBND, 0); 485 break; 486 case 'b': 487 EMIT(OWBND, 0); 488 break; 489 case 'W': 490 case 'w': 491 case 'S': 492 case 's': 493 p_b_pseudoclass(p, wc); 494 break; 495 case '1': 496 case '2': 497 case '3': 498 case '4': 499 case '5': 500 case '6': 501 case '7': 502 case '8': 503 case '9': 504 i = wc - '0'; 505 assert(i < NPAREN); 506 if (p->pend[i] != 0) { 507 assert(i <= p->g->nsub); 508 EMIT(OBACK_, i); 509 assert(p->pbegin[i] != 0); 510 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 511 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 512 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 513 EMIT(O_BACK, i); 514 } else 515 SETERROR(REG_ESUBREG); 516 p->g->backrefs = 1; 517 break; 518 default: 519 handled = 0; 520 } 521 /* Don't proceed to the POSIX bits if we've already handled it */ 522 if (handled) 523 break; 524 } 525 #endif 526 switch (wc) { 527 case '<': 528 EMIT(OBOW, 0); 529 break; 530 case '>': 531 EMIT(OEOW, 0); 532 break; 533 default: 534 if (may_escape(p, wc)) 535 ordinary(p, wc); 536 else 537 SETERROR(REG_EESCAPE); 538 break; 539 } 540 break; 541 #ifdef NO_STRICT_REGEX 542 case '{': /* okay as ordinary except if digit follows */ 543 (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 544 /* FALLTHROUGH */ 545 #endif 546 default: 547 if (p->error != 0) 548 return (false); 549 p->next--; 550 wc = WGETNEXT(); 551 ordinary(p, wc); 552 break; 553 } 554 555 if (!MORE()) 556 return (false); 557 c = PEEK(); 558 /* we call { a repetition if followed by a digit */ 559 if (!( c == '*' || c == '+' || c == '?' || 560 #ifdef NO_STRICT_REGEX 561 (c == '{' && MORE2() && isdigit((uch)PEEK2())) 562 #else 563 c == '{' 564 #endif 565 )) 566 return (false); /* no repetition, we're done */ 567 #ifndef NO_STRICT_REGEX 568 else if (c == '{') 569 (void)REQUIRE(MORE2() && \ 570 (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT); 571 #endif 572 NEXT(); 573 574 (void)REQUIRE(!wascaret, REG_BADRPT); 575 switch (c) { 576 case '*': /* implemented as +? */ 577 /* this case does not require the (y|) trick, noKLUDGE */ 578 INSERT(OPLUS_, pos); 579 ASTERN(O_PLUS, pos); 580 INSERT(OQUEST_, pos); 581 ASTERN(O_QUEST, pos); 582 break; 583 case '+': 584 INSERT(OPLUS_, pos); 585 ASTERN(O_PLUS, pos); 586 break; 587 case '?': 588 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 589 INSERT(OCH_, pos); /* offset slightly wrong */ 590 ASTERN(OOR1, pos); /* this one's right */ 591 AHEAD(pos); /* fix the OCH_ */ 592 EMIT(OOR2, 0); /* offset very wrong... */ 593 AHEAD(THERE()); /* ...so fix it */ 594 ASTERN(O_CH, THERETHERE()); 595 break; 596 case '{': 597 count = p_count(p); 598 if (EAT(',')) { 599 if (isdigit((uch)PEEK())) { 600 count2 = p_count(p); 601 (void)REQUIRE(count <= count2, REG_BADBR); 602 } else /* single number with comma */ 603 count2 = INFINITY; 604 } else /* just a single number */ 605 count2 = count; 606 repeat(p, pos, count, count2); 607 if (!EAT('}')) { /* error heuristics */ 608 while (MORE() && PEEK() != '}') 609 NEXT(); 610 (void)REQUIRE(MORE(), REG_EBRACE); 611 SETERROR(REG_BADBR); 612 } 613 break; 614 } 615 616 if (!MORE()) 617 return (false); 618 c = PEEK(); 619 if (!( c == '*' || c == '+' || c == '?' || 620 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 621 return (false); 622 SETERROR(REG_BADRPT); 623 return (false); 624 } 625 626 /* 627 - p_str - string (no metacharacters) "parser" 628 == static void p_str(struct parse *p); 629 */ 630 static void 631 p_str(struct parse *p) 632 { 633 (void)REQUIRE(MORE(), REG_EMPTY); 634 while (MORE()) 635 ordinary(p, WGETNEXT()); 636 } 637 638 /* 639 * Eat consecutive branch delimiters for the kind of expression that we are 640 * parsing, return the number of delimiters that we ate. 641 */ 642 static int 643 p_branch_eat_delim(struct parse *p, struct branchc *bc) 644 { 645 int nskip; 646 647 (void)bc; 648 nskip = 0; 649 while (EATSPEC('|')) 650 ++nskip; 651 return (nskip); 652 } 653 654 /* 655 * Insert necessary branch book-keeping operations. This emits a 656 * bogus 'next' offset, since we still have more to parse 657 */ 658 static void 659 p_branch_ins_offset(struct parse *p, struct branchc *bc) 660 { 661 662 if (bc->nbranch == 0) { 663 INSERT(OCH_, bc->start); /* offset is wrong */ 664 bc->fwd = bc->start; 665 bc->back = bc->start; 666 } 667 668 ASTERN(OOR1, bc->back); 669 bc->back = THERE(); 670 AHEAD(bc->fwd); /* fix previous offset */ 671 bc->fwd = HERE(); 672 EMIT(OOR2, 0); /* offset is very wrong */ 673 ++bc->nbranch; 674 } 675 676 /* 677 * Fix the offset of the tail branch, if we actually had any branches. 678 * This is to correct the bogus placeholder offset that we use. 679 */ 680 static void 681 p_branch_fix_tail(struct parse *p, struct branchc *bc) 682 { 683 684 /* Fix bogus offset at the tail if we actually have branches */ 685 if (bc->nbranch > 0) { 686 AHEAD(bc->fwd); 687 ASTERN(O_CH, bc->back); 688 } 689 } 690 691 /* 692 * Signal to the parser that an empty branch has been encountered; this will, 693 * in the future, be used to allow for more permissive behavior with empty 694 * branches. The return value should indicate whether parsing may continue 695 * or not. 696 */ 697 static bool 698 p_branch_empty(struct parse *p, struct branchc *bc) 699 { 700 701 (void)bc; 702 SETERROR(REG_EMPTY); 703 return (false); 704 } 705 706 /* 707 * Take care of any branching requirements. This includes inserting the 708 * appropriate branching instructions as well as eating all of the branch 709 * delimiters until we either run out of pattern or need to parse more pattern. 710 */ 711 static bool 712 p_branch_do(struct parse *p, struct branchc *bc) 713 { 714 int ate = 0; 715 716 ate = p_branch_eat_delim(p, bc); 717 if (ate == 0) 718 return (false); 719 else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc)) 720 /* 721 * Halt parsing only if we have an empty branch and p_branch_empty 722 * indicates that we must not continue. In the future, this will not 723 * necessarily be an error. 724 */ 725 return (false); 726 p_branch_ins_offset(p, bc); 727 728 return (true); 729 } 730 731 static void 732 p_bre_pre_parse(struct parse *p, struct branchc *bc) 733 { 734 735 (void) bc; 736 /* 737 * Does not move cleanly into expression parser because of 738 * ordinary interpration of * at the beginning position of 739 * an expression. 740 */ 741 if (EAT('^')) { 742 EMIT(OBOL, 0); 743 p->g->iflags |= USEBOL; 744 p->g->nbol++; 745 } 746 } 747 748 static void 749 p_bre_post_parse(struct parse *p, struct branchc *bc) 750 { 751 752 /* Expression is terminating due to EOL token */ 753 if (bc->terminate) { 754 DROP(1); 755 EMIT(OEOL, 0); 756 p->g->iflags |= USEEOL; 757 p->g->neol++; 758 } 759 } 760 761 /* 762 - p_re - Top level parser, concatenation and BRE anchoring 763 == static void p_re(struct parse *p, int end1, int end2); 764 * Giving end1 as OUT essentially eliminates the end1/end2 check. 765 * 766 * This implementation is a bit of a kludge, in that a trailing $ is first 767 * taken as an ordinary character and then revised to be an anchor. 768 * The amount of lookahead needed to avoid this kludge is excessive. 769 */ 770 static void 771 p_re(struct parse *p, 772 int end1, /* first terminating character */ 773 int end2) /* second terminating character; ignored for EREs */ 774 { 775 struct branchc bc; 776 777 bc.nbranch = 0; 778 if (end1 == OUT && end2 == OUT) 779 bc.outer = true; 780 else 781 bc.outer = false; 782 #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2)) 783 for (;;) { 784 bc.start = HERE(); 785 bc.nchain = 0; 786 bc.terminate = false; 787 if (p->pre_parse != NULL) 788 p->pre_parse(p, &bc); 789 while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) { 790 bc.terminate = p->parse_expr(p, &bc); 791 ++bc.nchain; 792 } 793 if (p->post_parse != NULL) 794 p->post_parse(p, &bc); 795 (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY); 796 #ifdef LIBREGEX 797 if (HERE() == bc.start && !p_branch_empty(p, &bc)) 798 break; 799 #endif 800 if (!p->allowbranch) 801 break; 802 /* 803 * p_branch_do's return value indicates whether we should 804 * continue parsing or not. This is both for correctness and 805 * a slight optimization, because it will check if we've 806 * encountered an empty branch or the end of the string 807 * immediately following a branch delimiter. 808 */ 809 if (!p_branch_do(p, &bc)) 810 break; 811 } 812 #undef SEE_END 813 if (p->allowbranch) 814 p_branch_fix_tail(p, &bc); 815 assert(!MORE() || SEE(end1)); 816 } 817 818 /* 819 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 820 == static bool p_simp_re(struct parse *p, struct branchc *bc); 821 */ 822 static bool /* was the simple RE an unbackslashed $? */ 823 p_simp_re(struct parse *p, struct branchc *bc) 824 { 825 int c; 826 int cc; /* convenient/control character */ 827 int count; 828 int count2; 829 sopno pos; 830 bool handled; 831 int i; 832 wint_t wc; 833 sopno subno; 834 # define BACKSL (1<<CHAR_BIT) 835 836 pos = HERE(); /* repetition op, if any, covers from here */ 837 handled = false; 838 839 assert(MORE()); /* caller should have ensured this */ 840 c = (uch)GETNEXT(); 841 if (c == '\\') { 842 (void)REQUIRE(MORE(), REG_EESCAPE); 843 cc = (uch)GETNEXT(); 844 c = BACKSL | cc; 845 #ifdef LIBREGEX 846 if (p->gnuext) { 847 handled = true; 848 switch (c) { 849 case BACKSL|'`': 850 EMIT(OBOS, 0); 851 break; 852 case BACKSL|'\'': 853 EMIT(OEOS, 0); 854 break; 855 case BACKSL|'B': 856 EMIT(ONWBND, 0); 857 break; 858 case BACKSL|'b': 859 EMIT(OWBND, 0); 860 break; 861 case BACKSL|'W': 862 case BACKSL|'w': 863 case BACKSL|'S': 864 case BACKSL|'s': 865 p_b_pseudoclass(p, cc); 866 break; 867 default: 868 handled = false; 869 } 870 } 871 #endif 872 } 873 if (!handled) { 874 switch (c) { 875 case '.': 876 if (p->g->cflags®_NEWLINE) 877 nonnewline(p); 878 else 879 EMIT(OANY, 0); 880 break; 881 case '[': 882 p_bracket(p); 883 break; 884 case BACKSL|'<': 885 EMIT(OBOW, 0); 886 break; 887 case BACKSL|'>': 888 EMIT(OEOW, 0); 889 break; 890 case BACKSL|'{': 891 SETERROR(REG_BADRPT); 892 break; 893 case BACKSL|'(': 894 p->g->nsub++; 895 subno = p->g->nsub; 896 if (subno < NPAREN) 897 p->pbegin[subno] = HERE(); 898 EMIT(OLPAREN, subno); 899 /* the MORE here is an error heuristic */ 900 if (MORE() && !SEETWO('\\', ')')) 901 p_re(p, '\\', ')'); 902 if (subno < NPAREN) { 903 p->pend[subno] = HERE(); 904 assert(p->pend[subno] != 0); 905 } 906 EMIT(ORPAREN, subno); 907 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 908 break; 909 case BACKSL|')': /* should not get here -- must be user */ 910 #ifdef NO_STRICT_REGEX 911 case BACKSL|'}': 912 #endif 913 SETERROR(REG_EPAREN); 914 break; 915 case BACKSL|'1': 916 case BACKSL|'2': 917 case BACKSL|'3': 918 case BACKSL|'4': 919 case BACKSL|'5': 920 case BACKSL|'6': 921 case BACKSL|'7': 922 case BACKSL|'8': 923 case BACKSL|'9': 924 i = (c&~BACKSL) - '0'; 925 assert(i < NPAREN); 926 if (p->pend[i] != 0) { 927 assert(i <= p->g->nsub); 928 EMIT(OBACK_, i); 929 assert(p->pbegin[i] != 0); 930 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 931 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 932 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 933 EMIT(O_BACK, i); 934 } else 935 SETERROR(REG_ESUBREG); 936 p->g->backrefs = 1; 937 break; 938 case '*': 939 /* 940 * Ordinary if used as the first character beyond BOL anchor of 941 * a (sub-)expression, counts as a bad repetition operator if it 942 * appears otherwise. 943 */ 944 (void)REQUIRE(bc->nchain == 0, REG_BADRPT); 945 /* FALLTHROUGH */ 946 default: 947 if (p->error != 0) 948 return (false); /* Definitely not $... */ 949 p->next--; 950 wc = WGETNEXT(); 951 if ((c & BACKSL) == 0 || may_escape(p, wc)) 952 ordinary(p, wc); 953 else 954 SETERROR(REG_EESCAPE); 955 break; 956 } 957 } 958 959 if (EAT('*')) { /* implemented as +? */ 960 /* this case does not require the (y|) trick, noKLUDGE */ 961 INSERT(OPLUS_, pos); 962 ASTERN(O_PLUS, pos); 963 INSERT(OQUEST_, pos); 964 ASTERN(O_QUEST, pos); 965 #ifdef LIBREGEX 966 } else if (p->gnuext && EATTWO('\\', '?')) { 967 INSERT(OQUEST_, pos); 968 ASTERN(O_QUEST, pos); 969 } else if (p->gnuext && EATTWO('\\', '+')) { 970 INSERT(OPLUS_, pos); 971 ASTERN(O_PLUS, pos); 972 #endif 973 } else if (EATTWO('\\', '{')) { 974 count = p_count(p); 975 if (EAT(',')) { 976 if (MORE() && isdigit((uch)PEEK())) { 977 count2 = p_count(p); 978 (void)REQUIRE(count <= count2, REG_BADBR); 979 } else /* single number with comma */ 980 count2 = INFINITY; 981 } else /* just a single number */ 982 count2 = count; 983 repeat(p, pos, count, count2); 984 if (!EATTWO('\\', '}')) { /* error heuristics */ 985 while (MORE() && !SEETWO('\\', '}')) 986 NEXT(); 987 (void)REQUIRE(MORE(), REG_EBRACE); 988 SETERROR(REG_BADBR); 989 } 990 } else if (c == '$') /* $ (but not \$) ends it */ 991 return (true); 992 993 return (false); 994 } 995 996 /* 997 - p_count - parse a repetition count 998 == static int p_count(struct parse *p); 999 */ 1000 static int /* the value */ 1001 p_count(struct parse *p) 1002 { 1003 int count = 0; 1004 int ndigits = 0; 1005 1006 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 1007 count = count*10 + ((uch)GETNEXT() - '0'); 1008 ndigits++; 1009 } 1010 1011 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 1012 return(count); 1013 } 1014 1015 /* 1016 - p_bracket - parse a bracketed character list 1017 == static void p_bracket(struct parse *p); 1018 */ 1019 static void 1020 p_bracket(struct parse *p) 1021 { 1022 cset *cs; 1023 wint_t ch; 1024 1025 /* Dept of Truly Sickening Special-Case Kludges */ 1026 if (p->end - p->next > 5) { 1027 if (strncmp(p->next, "[:<:]]", 6) == 0) { 1028 EMIT(OBOW, 0); 1029 NEXTn(6); 1030 return; 1031 } 1032 if (strncmp(p->next, "[:>:]]", 6) == 0) { 1033 EMIT(OEOW, 0); 1034 NEXTn(6); 1035 return; 1036 } 1037 } 1038 1039 if ((cs = allocset(p)) == NULL) 1040 return; 1041 1042 if (p->g->cflags®_ICASE) 1043 cs->icase = 1; 1044 if (EAT('^')) 1045 cs->invert = 1; 1046 if (EAT(']')) 1047 CHadd(p, cs, ']'); 1048 else if (EAT('-')) 1049 CHadd(p, cs, '-'); 1050 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 1051 p_b_term(p, cs); 1052 if (EAT('-')) 1053 CHadd(p, cs, '-'); 1054 (void)MUSTEAT(']', REG_EBRACK); 1055 1056 if (p->error != 0) /* don't mess things up further */ 1057 return; 1058 1059 if (cs->invert && p->g->cflags®_NEWLINE) 1060 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 1061 1062 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 1063 ordinary(p, ch); 1064 freeset(p, cs); 1065 } else 1066 EMIT(OANYOF, (int)(cs - p->g->sets)); 1067 } 1068 1069 static int 1070 p_range_cmp(wchar_t c1, wchar_t c2) 1071 { 1072 #ifndef LIBREGEX 1073 return __wcollate_range_cmp(c1, c2); 1074 #else 1075 /* Copied from libc/collate __wcollate_range_cmp */ 1076 wchar_t s1[2], s2[2]; 1077 1078 s1[0] = c1; 1079 s1[1] = L'\0'; 1080 s2[0] = c2; 1081 s2[1] = L'\0'; 1082 return (wcscoll(s1, s2)); 1083 #endif 1084 } 1085 1086 /* 1087 - p_b_term - parse one term of a bracketed character list 1088 == static void p_b_term(struct parse *p, cset *cs); 1089 */ 1090 static void 1091 p_b_term(struct parse *p, cset *cs) 1092 { 1093 char c; 1094 wint_t start, finish; 1095 wint_t i; 1096 #ifndef LIBREGEX 1097 struct xlocale_collate *table = 1098 (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE]; 1099 #endif 1100 /* classify what we've got */ 1101 switch ((MORE()) ? PEEK() : '\0') { 1102 case '[': 1103 c = (MORE2()) ? PEEK2() : '\0'; 1104 break; 1105 case '-': 1106 SETERROR(REG_ERANGE); 1107 return; /* NOTE RETURN */ 1108 default: 1109 c = '\0'; 1110 break; 1111 } 1112 1113 switch (c) { 1114 case ':': /* character class */ 1115 NEXT2(); 1116 (void)REQUIRE(MORE(), REG_EBRACK); 1117 c = PEEK(); 1118 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 1119 p_b_cclass(p, cs); 1120 (void)REQUIRE(MORE(), REG_EBRACK); 1121 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 1122 break; 1123 case '=': /* equivalence class */ 1124 NEXT2(); 1125 (void)REQUIRE(MORE(), REG_EBRACK); 1126 c = PEEK(); 1127 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 1128 p_b_eclass(p, cs); 1129 (void)REQUIRE(MORE(), REG_EBRACK); 1130 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 1131 break; 1132 default: /* symbol, ordinary character, or range */ 1133 start = p_b_symbol(p); 1134 if (SEE('-') && MORE2() && PEEK2() != ']') { 1135 /* range */ 1136 NEXT(); 1137 if (EAT('-')) 1138 finish = '-'; 1139 else 1140 finish = p_b_symbol(p); 1141 } else 1142 finish = start; 1143 if (start == finish) 1144 CHadd(p, cs, start); 1145 else { 1146 #ifndef LIBREGEX 1147 if (table->__collate_load_error || MB_CUR_MAX > 1) { 1148 #else 1149 if (MB_CUR_MAX > 1) { 1150 #endif 1151 (void)REQUIRE(start <= finish, REG_ERANGE); 1152 CHaddrange(p, cs, start, finish); 1153 } else { 1154 (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE); 1155 for (i = 0; i <= UCHAR_MAX; i++) { 1156 if (p_range_cmp(start, i) <= 0 && 1157 p_range_cmp(i, finish) <= 0 ) 1158 CHadd(p, cs, i); 1159 } 1160 } 1161 } 1162 break; 1163 } 1164 } 1165 1166 /* 1167 - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S) 1168 == static int p_b_pseudoclass(struct parse *p, char c) 1169 */ 1170 static int 1171 p_b_pseudoclass(struct parse *p, char c) { 1172 cset *cs; 1173 1174 if ((cs = allocset(p)) == NULL) 1175 return(0); 1176 1177 if (p->g->cflags®_ICASE) 1178 cs->icase = 1; 1179 1180 switch (c) { 1181 case 'W': 1182 cs->invert = 1; 1183 /* PASSTHROUGH */ 1184 case 'w': 1185 p_b_cclass_named(p, cs, "alnum"); 1186 break; 1187 case 'S': 1188 cs->invert = 1; 1189 /* PASSTHROUGH */ 1190 case 's': 1191 p_b_cclass_named(p, cs, "space"); 1192 break; 1193 default: 1194 return(0); 1195 } 1196 1197 EMIT(OANYOF, (int)(cs - p->g->sets)); 1198 return(1); 1199 } 1200 1201 /* 1202 - p_b_cclass - parse a character-class name and deal with it 1203 == static void p_b_cclass(struct parse *p, cset *cs); 1204 */ 1205 static void 1206 p_b_cclass(struct parse *p, cset *cs) 1207 { 1208 const char *sp = p->next; 1209 size_t len; 1210 char clname[16]; 1211 1212 while (MORE() && isalpha((uch)PEEK())) 1213 NEXT(); 1214 len = p->next - sp; 1215 if (len >= sizeof(clname) - 1) { 1216 SETERROR(REG_ECTYPE); 1217 return; 1218 } 1219 memcpy(clname, sp, len); 1220 clname[len] = '\0'; 1221 1222 p_b_cclass_named(p, cs, clname); 1223 } 1224 /* 1225 - p_b_cclass_named - deal with a named character class 1226 == static void p_b_cclass_named(struct parse *p, cset *cs, const char []); 1227 */ 1228 static void 1229 p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) { 1230 wctype_t wct; 1231 1232 if ((wct = wctype(clname)) == 0) { 1233 SETERROR(REG_ECTYPE); 1234 return; 1235 } 1236 CHaddtype(p, cs, wct); 1237 } 1238 1239 /* 1240 - p_b_eclass - parse an equivalence-class name and deal with it 1241 == static void p_b_eclass(struct parse *p, cset *cs); 1242 * 1243 * This implementation is incomplete. xxx 1244 */ 1245 static void 1246 p_b_eclass(struct parse *p, cset *cs) 1247 { 1248 wint_t c; 1249 1250 c = p_b_coll_elem(p, '='); 1251 CHadd(p, cs, c); 1252 } 1253 1254 /* 1255 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 1256 == static wint_t p_b_symbol(struct parse *p); 1257 */ 1258 static wint_t /* value of symbol */ 1259 p_b_symbol(struct parse *p) 1260 { 1261 wint_t value; 1262 1263 (void)REQUIRE(MORE(), REG_EBRACK); 1264 if (!EATTWO('[', '.')) 1265 return(WGETNEXT()); 1266 1267 /* collating symbol */ 1268 value = p_b_coll_elem(p, '.'); 1269 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 1270 return(value); 1271 } 1272 1273 /* 1274 - p_b_coll_elem - parse a collating-element name and look it up 1275 == static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 1276 */ 1277 static wint_t /* value of collating element */ 1278 p_b_coll_elem(struct parse *p, 1279 wint_t endc) /* name ended by endc,']' */ 1280 { 1281 const char *sp = p->next; 1282 struct cname *cp; 1283 mbstate_t mbs; 1284 wchar_t wc; 1285 size_t clen, len; 1286 1287 while (MORE() && !SEETWO(endc, ']')) 1288 NEXT(); 1289 if (!MORE()) { 1290 SETERROR(REG_EBRACK); 1291 return(0); 1292 } 1293 len = p->next - sp; 1294 for (cp = cnames; cp->name != NULL; cp++) 1295 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) 1296 return(cp->code); /* known name */ 1297 memset(&mbs, 0, sizeof(mbs)); 1298 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 1299 return (wc); /* single character */ 1300 else if (clen == (size_t)-1 || clen == (size_t)-2) 1301 SETERROR(REG_ILLSEQ); 1302 else 1303 SETERROR(REG_ECOLLATE); /* neither */ 1304 return(0); 1305 } 1306 1307 /* 1308 - may_escape - determine whether 'ch' is escape-able in the current context 1309 == static int may_escape(struct parse *p, const wint_t ch) 1310 */ 1311 static bool 1312 may_escape(struct parse *p, const wint_t ch) 1313 { 1314 1315 if ((p->pflags & PFLAG_LEGACY_ESC) != 0) 1316 return (true); 1317 if (iswalpha(ch) || ch == '\'' || ch == '`') 1318 return (false); 1319 return (true); 1320 #ifdef NOTYET 1321 /* 1322 * Build a whitelist of characters that may be escaped to produce an 1323 * ordinary in the current context. This assumes that these have not 1324 * been otherwise interpreted as a special character. Escaping an 1325 * ordinary character yields undefined results according to 1326 * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take 1327 * advantage of this and use escaped ordinary characters to provide 1328 * special meaning, e.g. \b, \B, \w, \W, \s, \S. 1329 */ 1330 switch(ch) { 1331 case '|': 1332 case '+': 1333 case '?': 1334 /* The above characters may not be escaped in BREs */ 1335 if (!(p->g->cflags®_EXTENDED)) 1336 return (false); 1337 /* Fallthrough */ 1338 case '(': 1339 case ')': 1340 case '{': 1341 case '}': 1342 case '.': 1343 case '[': 1344 case ']': 1345 case '\\': 1346 case '*': 1347 case '^': 1348 case '$': 1349 return (true); 1350 default: 1351 return (false); 1352 } 1353 #endif 1354 } 1355 1356 /* 1357 - othercase - return the case counterpart of an alphabetic 1358 == static wint_t othercase(wint_t ch); 1359 */ 1360 static wint_t /* if no counterpart, return ch */ 1361 othercase(wint_t ch) 1362 { 1363 assert(iswalpha(ch)); 1364 if (iswupper(ch)) 1365 return(towlower(ch)); 1366 else if (iswlower(ch)) 1367 return(towupper(ch)); 1368 else /* peculiar, but could happen */ 1369 return(ch); 1370 } 1371 1372 /* 1373 - bothcases - emit a dualcase version of a two-case character 1374 == static void bothcases(struct parse *p, wint_t ch); 1375 * 1376 * Boy, is this implementation ever a kludge... 1377 */ 1378 static void 1379 bothcases(struct parse *p, wint_t ch) 1380 { 1381 const char *oldnext = p->next; 1382 const char *oldend = p->end; 1383 char bracket[3 + MB_LEN_MAX]; 1384 size_t n; 1385 mbstate_t mbs; 1386 1387 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1388 p->next = bracket; 1389 memset(&mbs, 0, sizeof(mbs)); 1390 n = wcrtomb(bracket, ch, &mbs); 1391 assert(n != (size_t)-1); 1392 bracket[n] = ']'; 1393 bracket[n + 1] = '\0'; 1394 p->end = bracket+n+1; 1395 p_bracket(p); 1396 assert(p->next == p->end); 1397 p->next = oldnext; 1398 p->end = oldend; 1399 } 1400 1401 /* 1402 - ordinary - emit an ordinary character 1403 == static void ordinary(struct parse *p, wint_t ch); 1404 */ 1405 static void 1406 ordinary(struct parse *p, wint_t ch) 1407 { 1408 cset *cs; 1409 1410 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 1411 bothcases(p, ch); 1412 else if ((ch & OPDMASK) == ch) 1413 EMIT(OCHAR, ch); 1414 else { 1415 /* 1416 * Kludge: character is too big to fit into an OCHAR operand. 1417 * Emit a singleton set. 1418 */ 1419 if ((cs = allocset(p)) == NULL) 1420 return; 1421 CHadd(p, cs, ch); 1422 EMIT(OANYOF, (int)(cs - p->g->sets)); 1423 } 1424 } 1425 1426 /* 1427 - nonnewline - emit REG_NEWLINE version of OANY 1428 == static void nonnewline(struct parse *p); 1429 * 1430 * Boy, is this implementation ever a kludge... 1431 */ 1432 static void 1433 nonnewline(struct parse *p) 1434 { 1435 const char *oldnext = p->next; 1436 const char *oldend = p->end; 1437 char bracket[4]; 1438 1439 p->next = bracket; 1440 p->end = bracket+3; 1441 bracket[0] = '^'; 1442 bracket[1] = '\n'; 1443 bracket[2] = ']'; 1444 bracket[3] = '\0'; 1445 p_bracket(p); 1446 assert(p->next == bracket+3); 1447 p->next = oldnext; 1448 p->end = oldend; 1449 } 1450 1451 /* 1452 - repeat - generate code for a bounded repetition, recursively if needed 1453 == static void repeat(struct parse *p, sopno start, int from, int to); 1454 */ 1455 static void 1456 repeat(struct parse *p, 1457 sopno start, /* operand from here to end of strip */ 1458 int from, /* repeated from this number */ 1459 int to) /* to this number of times (maybe INFINITY) */ 1460 { 1461 sopno finish = HERE(); 1462 # define N 2 1463 # define INF 3 1464 # define REP(f, t) ((f)*8 + (t)) 1465 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1466 sopno copy; 1467 1468 if (p->error != 0) /* head off possible runaway recursion */ 1469 return; 1470 1471 assert(from <= to); 1472 1473 switch (REP(MAP(from), MAP(to))) { 1474 case REP(0, 0): /* must be user doing this */ 1475 DROP(finish-start); /* drop the operand */ 1476 break; 1477 case REP(0, 1): /* as x{1,1}? */ 1478 case REP(0, N): /* as x{1,n}? */ 1479 case REP(0, INF): /* as x{1,}? */ 1480 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1481 INSERT(OCH_, start); /* offset is wrong... */ 1482 repeat(p, start+1, 1, to); 1483 ASTERN(OOR1, start); 1484 AHEAD(start); /* ... fix it */ 1485 EMIT(OOR2, 0); 1486 AHEAD(THERE()); 1487 ASTERN(O_CH, THERETHERE()); 1488 break; 1489 case REP(1, 1): /* trivial case */ 1490 /* done */ 1491 break; 1492 case REP(1, N): /* as x?x{1,n-1} */ 1493 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1494 INSERT(OCH_, start); 1495 ASTERN(OOR1, start); 1496 AHEAD(start); 1497 EMIT(OOR2, 0); /* offset very wrong... */ 1498 AHEAD(THERE()); /* ...so fix it */ 1499 ASTERN(O_CH, THERETHERE()); 1500 copy = dupl(p, start+1, finish+1); 1501 assert(copy == finish+4); 1502 repeat(p, copy, 1, to-1); 1503 break; 1504 case REP(1, INF): /* as x+ */ 1505 INSERT(OPLUS_, start); 1506 ASTERN(O_PLUS, start); 1507 break; 1508 case REP(N, N): /* as xx{m-1,n-1} */ 1509 copy = dupl(p, start, finish); 1510 repeat(p, copy, from-1, to-1); 1511 break; 1512 case REP(N, INF): /* as xx{n-1,INF} */ 1513 copy = dupl(p, start, finish); 1514 repeat(p, copy, from-1, to); 1515 break; 1516 default: /* "can't happen" */ 1517 SETERROR(REG_ASSERT); /* just in case */ 1518 break; 1519 } 1520 } 1521 1522 /* 1523 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1524 - character from the parse struct, signals a REG_ILLSEQ error if the 1525 - character can't be converted. Returns the number of bytes consumed. 1526 */ 1527 static wint_t 1528 wgetnext(struct parse *p) 1529 { 1530 mbstate_t mbs; 1531 wchar_t wc; 1532 size_t n; 1533 1534 memset(&mbs, 0, sizeof(mbs)); 1535 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); 1536 if (n == (size_t)-1 || n == (size_t)-2) { 1537 SETERROR(REG_ILLSEQ); 1538 return (0); 1539 } 1540 if (n == 0) 1541 n = 1; 1542 p->next += n; 1543 return (wc); 1544 } 1545 1546 /* 1547 - seterr - set an error condition 1548 == static int seterr(struct parse *p, int e); 1549 */ 1550 static int /* useless but makes type checking happy */ 1551 seterr(struct parse *p, int e) 1552 { 1553 if (p->error == 0) /* keep earliest error condition */ 1554 p->error = e; 1555 p->next = nuls; /* try to bring things to a halt */ 1556 p->end = nuls; 1557 return(0); /* make the return value well-defined */ 1558 } 1559 1560 /* 1561 - allocset - allocate a set of characters for [] 1562 == static cset *allocset(struct parse *p); 1563 */ 1564 static cset * 1565 allocset(struct parse *p) 1566 { 1567 cset *cs, *ncs; 1568 1569 ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs)); 1570 if (ncs == NULL) { 1571 SETERROR(REG_ESPACE); 1572 return (NULL); 1573 } 1574 p->g->sets = ncs; 1575 cs = &p->g->sets[p->g->ncsets++]; 1576 memset(cs, 0, sizeof(*cs)); 1577 1578 return(cs); 1579 } 1580 1581 /* 1582 - freeset - free a now-unused set 1583 == static void freeset(struct parse *p, cset *cs); 1584 */ 1585 static void 1586 freeset(struct parse *p, cset *cs) 1587 { 1588 cset *top = &p->g->sets[p->g->ncsets]; 1589 1590 free(cs->wides); 1591 free(cs->ranges); 1592 free(cs->types); 1593 memset(cs, 0, sizeof(*cs)); 1594 if (cs == top-1) /* recover only the easy case */ 1595 p->g->ncsets--; 1596 } 1597 1598 /* 1599 - singleton - Determine whether a set contains only one character, 1600 - returning it if so, otherwise returning OUT. 1601 */ 1602 static wint_t 1603 singleton(cset *cs) 1604 { 1605 wint_t i, s, n; 1606 1607 /* Exclude the complicated cases we don't want to deal with */ 1608 if (cs->nranges != 0 || cs->ntypes != 0 || cs->icase != 0) 1609 return (OUT); 1610 1611 if (cs->nwides > 1) 1612 return (OUT); 1613 1614 /* Count the number of characters present in the bitmap */ 1615 for (i = n = 0; i < NC; i++) 1616 if (CHIN(cs, i)) { 1617 n++; 1618 s = i; 1619 } 1620 1621 if (n > 1) 1622 return (OUT); 1623 1624 if (n == 1) { 1625 if (cs->nwides == 0) 1626 return (s); 1627 else 1628 return (OUT); 1629 } 1630 if (cs->nwides == 1) 1631 return (cs->wides[0]); 1632 1633 return (OUT); 1634 } 1635 1636 /* 1637 - CHadd - add character to character set. 1638 */ 1639 static void 1640 CHadd(struct parse *p, cset *cs, wint_t ch) 1641 { 1642 wint_t nch, *newwides; 1643 assert(ch >= 0); 1644 if (ch < NC) 1645 cs->bmp[ch >> 3] |= 1 << (ch & 7); 1646 else { 1647 newwides = reallocarray(cs->wides, cs->nwides + 1, 1648 sizeof(*cs->wides)); 1649 if (newwides == NULL) { 1650 SETERROR(REG_ESPACE); 1651 return; 1652 } 1653 cs->wides = newwides; 1654 cs->wides[cs->nwides++] = ch; 1655 } 1656 if (cs->icase) { 1657 if ((nch = towlower(ch)) < NC) 1658 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1659 if ((nch = towupper(ch)) < NC) 1660 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1661 } 1662 } 1663 1664 /* 1665 - CHaddrange - add all characters in the range [min,max] to a character set. 1666 */ 1667 static void 1668 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 1669 { 1670 crange *newranges; 1671 1672 for (; min < NC && min <= max; min++) 1673 CHadd(p, cs, min); 1674 if (min >= max) 1675 return; 1676 newranges = reallocarray(cs->ranges, cs->nranges + 1, 1677 sizeof(*cs->ranges)); 1678 if (newranges == NULL) { 1679 SETERROR(REG_ESPACE); 1680 return; 1681 } 1682 cs->ranges = newranges; 1683 cs->ranges[cs->nranges].min = min; 1684 cs->ranges[cs->nranges].max = max; 1685 cs->nranges++; 1686 } 1687 1688 /* 1689 - CHaddtype - add all characters of a certain type to a character set. 1690 */ 1691 static void 1692 CHaddtype(struct parse *p, cset *cs, wctype_t wct) 1693 { 1694 wint_t i; 1695 wctype_t *newtypes; 1696 1697 for (i = 0; i < NC; i++) 1698 if (iswctype(i, wct)) 1699 CHadd(p, cs, i); 1700 newtypes = reallocarray(cs->types, cs->ntypes + 1, 1701 sizeof(*cs->types)); 1702 if (newtypes == NULL) { 1703 SETERROR(REG_ESPACE); 1704 return; 1705 } 1706 cs->types = newtypes; 1707 cs->types[cs->ntypes++] = wct; 1708 } 1709 1710 /* 1711 - dupl - emit a duplicate of a bunch of sops 1712 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1713 */ 1714 static sopno /* start of duplicate */ 1715 dupl(struct parse *p, 1716 sopno start, /* from here */ 1717 sopno finish) /* to this less one */ 1718 { 1719 sopno ret = HERE(); 1720 sopno len = finish - start; 1721 1722 assert(finish >= start); 1723 if (len == 0) 1724 return(ret); 1725 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1726 return(ret); 1727 (void) memcpy((char *)(p->strip + p->slen), 1728 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1729 p->slen += len; 1730 return(ret); 1731 } 1732 1733 /* 1734 - doemit - emit a strip operator 1735 == static void doemit(struct parse *p, sop op, size_t opnd); 1736 * 1737 * It might seem better to implement this as a macro with a function as 1738 * hard-case backup, but it's just too big and messy unless there are 1739 * some changes to the data structures. Maybe later. 1740 */ 1741 static void 1742 doemit(struct parse *p, sop op, size_t opnd) 1743 { 1744 /* avoid making error situations worse */ 1745 if (p->error != 0) 1746 return; 1747 1748 /* deal with oversize operands ("can't happen", more or less) */ 1749 assert(opnd < 1<<OPSHIFT); 1750 1751 /* deal with undersized strip */ 1752 if (p->slen >= p->ssize) 1753 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1754 return; 1755 1756 /* finally, it's all reduced to the easy case */ 1757 p->strip[p->slen++] = SOP(op, opnd); 1758 } 1759 1760 /* 1761 - doinsert - insert a sop into the strip 1762 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1763 */ 1764 static void 1765 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1766 { 1767 sopno sn; 1768 sop s; 1769 int i; 1770 1771 /* avoid making error situations worse */ 1772 if (p->error != 0) 1773 return; 1774 1775 sn = HERE(); 1776 EMIT(op, opnd); /* do checks, ensure space */ 1777 assert(HERE() == sn+1); 1778 s = p->strip[sn]; 1779 1780 /* adjust paren pointers */ 1781 assert(pos > 0); 1782 for (i = 1; i < NPAREN; i++) { 1783 if (p->pbegin[i] >= pos) { 1784 p->pbegin[i]++; 1785 } 1786 if (p->pend[i] >= pos) { 1787 p->pend[i]++; 1788 } 1789 } 1790 1791 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1792 (HERE()-pos-1)*sizeof(sop)); 1793 p->strip[pos] = s; 1794 } 1795 1796 /* 1797 - dofwd - complete a forward reference 1798 == static void dofwd(struct parse *p, sopno pos, sop value); 1799 */ 1800 static void 1801 dofwd(struct parse *p, sopno pos, sop value) 1802 { 1803 /* avoid making error situations worse */ 1804 if (p->error != 0) 1805 return; 1806 1807 assert(value < 1<<OPSHIFT); 1808 p->strip[pos] = OP(p->strip[pos]) | value; 1809 } 1810 1811 /* 1812 - enlarge - enlarge the strip 1813 == static int enlarge(struct parse *p, sopno size); 1814 */ 1815 static int 1816 enlarge(struct parse *p, sopno size) 1817 { 1818 sop *sp; 1819 1820 if (p->ssize >= size) 1821 return 1; 1822 1823 sp = reallocarray(p->strip, size, sizeof(sop)); 1824 if (sp == NULL) { 1825 SETERROR(REG_ESPACE); 1826 return 0; 1827 } 1828 p->strip = sp; 1829 p->ssize = size; 1830 return 1; 1831 } 1832 1833 /* 1834 - stripsnug - compact the strip 1835 == static void stripsnug(struct parse *p, struct re_guts *g); 1836 */ 1837 static void 1838 stripsnug(struct parse *p, struct re_guts *g) 1839 { 1840 g->nstates = p->slen; 1841 g->strip = reallocarray((char *)p->strip, p->slen, sizeof(sop)); 1842 if (g->strip == NULL) { 1843 SETERROR(REG_ESPACE); 1844 g->strip = p->strip; 1845 } 1846 } 1847 1848 /* 1849 - findmust - fill in must and mlen with longest mandatory literal string 1850 == static void findmust(struct parse *p, struct re_guts *g); 1851 * 1852 * This algorithm could do fancy things like analyzing the operands of | 1853 * for common subsequences. Someday. This code is simple and finds most 1854 * of the interesting cases. 1855 * 1856 * Note that must and mlen got initialized during setup. 1857 */ 1858 static void 1859 findmust(struct parse *p, struct re_guts *g) 1860 { 1861 sop *scan; 1862 sop *start = NULL; 1863 sop *newstart = NULL; 1864 sopno newlen; 1865 sop s; 1866 char *cp; 1867 int offset; 1868 char buf[MB_LEN_MAX]; 1869 size_t clen; 1870 mbstate_t mbs; 1871 1872 /* avoid making error situations worse */ 1873 if (p->error != 0) 1874 return; 1875 1876 /* 1877 * It's not generally safe to do a ``char'' substring search on 1878 * multibyte character strings, but it's safe for at least 1879 * UTF-8 (see RFC 3629). 1880 */ 1881 if (MB_CUR_MAX > 1 && 1882 strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) 1883 return; 1884 1885 /* find the longest OCHAR sequence in strip */ 1886 newlen = 0; 1887 offset = 0; 1888 g->moffset = 0; 1889 scan = g->strip + 1; 1890 do { 1891 s = *scan++; 1892 switch (OP(s)) { 1893 case OCHAR: /* sequence member */ 1894 if (newlen == 0) { /* new sequence */ 1895 memset(&mbs, 0, sizeof(mbs)); 1896 newstart = scan - 1; 1897 } 1898 clen = wcrtomb(buf, OPND(s), &mbs); 1899 if (clen == (size_t)-1) 1900 goto toohard; 1901 newlen += clen; 1902 break; 1903 case OPLUS_: /* things that don't break one */ 1904 case OLPAREN: 1905 case ORPAREN: 1906 break; 1907 case OQUEST_: /* things that must be skipped */ 1908 case OCH_: 1909 offset = altoffset(scan, offset); 1910 scan--; 1911 do { 1912 scan += OPND(s); 1913 s = *scan; 1914 /* assert() interferes w debug printouts */ 1915 if (OP(s) != (sop)O_QUEST && 1916 OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) { 1917 g->iflags |= BAD; 1918 return; 1919 } 1920 } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH); 1921 /* FALLTHROUGH */ 1922 case OBOW: /* things that break a sequence */ 1923 case OEOW: 1924 case OBOL: 1925 case OEOL: 1926 case OBOS: 1927 case OEOS: 1928 case OWBND: 1929 case ONWBND: 1930 case O_QUEST: 1931 case O_CH: 1932 case OEND: 1933 if (newlen > (sopno)g->mlen) { /* ends one */ 1934 start = newstart; 1935 g->mlen = newlen; 1936 if (offset > -1) { 1937 g->moffset += offset; 1938 offset = newlen; 1939 } else 1940 g->moffset = offset; 1941 } else { 1942 if (offset > -1) 1943 offset += newlen; 1944 } 1945 newlen = 0; 1946 break; 1947 case OANY: 1948 if (newlen > (sopno)g->mlen) { /* ends one */ 1949 start = newstart; 1950 g->mlen = newlen; 1951 if (offset > -1) { 1952 g->moffset += offset; 1953 offset = newlen; 1954 } else 1955 g->moffset = offset; 1956 } else { 1957 if (offset > -1) 1958 offset += newlen; 1959 } 1960 if (offset > -1) 1961 offset++; 1962 newlen = 0; 1963 break; 1964 case OANYOF: /* may or may not invalidate offset */ 1965 /* First, everything as OANY */ 1966 if (newlen > (sopno)g->mlen) { /* ends one */ 1967 start = newstart; 1968 g->mlen = newlen; 1969 if (offset > -1) { 1970 g->moffset += offset; 1971 offset = newlen; 1972 } else 1973 g->moffset = offset; 1974 } else { 1975 if (offset > -1) 1976 offset += newlen; 1977 } 1978 if (offset > -1) 1979 offset++; 1980 newlen = 0; 1981 break; 1982 toohard: 1983 default: 1984 /* Anything here makes it impossible or too hard 1985 * to calculate the offset -- so we give up; 1986 * save the last known good offset, in case the 1987 * must sequence doesn't occur later. 1988 */ 1989 if (newlen > (sopno)g->mlen) { /* ends one */ 1990 start = newstart; 1991 g->mlen = newlen; 1992 if (offset > -1) 1993 g->moffset += offset; 1994 else 1995 g->moffset = offset; 1996 } 1997 offset = -1; 1998 newlen = 0; 1999 break; 2000 } 2001 } while (OP(s) != OEND); 2002 2003 if (g->mlen == 0) { /* there isn't one */ 2004 g->moffset = -1; 2005 return; 2006 } 2007 2008 /* turn it into a character string */ 2009 g->must = malloc((size_t)g->mlen + 1); 2010 if (g->must == NULL) { /* argh; just forget it */ 2011 g->mlen = 0; 2012 g->moffset = -1; 2013 return; 2014 } 2015 cp = g->must; 2016 scan = start; 2017 memset(&mbs, 0, sizeof(mbs)); 2018 while (cp < g->must + g->mlen) { 2019 while (OP(s = *scan++) != OCHAR) 2020 continue; 2021 clen = wcrtomb(cp, OPND(s), &mbs); 2022 assert(clen != (size_t)-1); 2023 cp += clen; 2024 } 2025 assert(cp == g->must + g->mlen); 2026 *cp++ = '\0'; /* just on general principles */ 2027 } 2028 2029 /* 2030 - altoffset - choose biggest offset among multiple choices 2031 == static int altoffset(sop *scan, int offset); 2032 * 2033 * Compute, recursively if necessary, the largest offset among multiple 2034 * re paths. 2035 */ 2036 static int 2037 altoffset(sop *scan, int offset) 2038 { 2039 int largest; 2040 int try; 2041 sop s; 2042 2043 /* If we gave up already on offsets, return */ 2044 if (offset == -1) 2045 return -1; 2046 2047 largest = 0; 2048 try = 0; 2049 s = *scan++; 2050 while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) { 2051 switch (OP(s)) { 2052 case OOR1: 2053 if (try > largest) 2054 largest = try; 2055 try = 0; 2056 break; 2057 case OQUEST_: 2058 case OCH_: 2059 try = altoffset(scan, try); 2060 if (try == -1) 2061 return -1; 2062 scan--; 2063 do { 2064 scan += OPND(s); 2065 s = *scan; 2066 if (OP(s) != (sop)O_QUEST && 2067 OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) 2068 return -1; 2069 } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH); 2070 /* We must skip to the next position, or we'll 2071 * leave altoffset() too early. 2072 */ 2073 scan++; 2074 break; 2075 case OANYOF: 2076 case OCHAR: 2077 case OANY: 2078 try++; 2079 case OBOW: 2080 case OEOW: 2081 case OWBND: 2082 case ONWBND: 2083 case OLPAREN: 2084 case ORPAREN: 2085 case OOR2: 2086 break; 2087 default: 2088 try = -1; 2089 break; 2090 } 2091 if (try == -1) 2092 return -1; 2093 s = *scan++; 2094 } 2095 2096 if (try > largest) 2097 largest = try; 2098 2099 return largest+offset; 2100 } 2101 2102 /* 2103 - computejumps - compute char jumps for BM scan 2104 == static void computejumps(struct parse *p, struct re_guts *g); 2105 * 2106 * This algorithm assumes g->must exists and is has size greater than 2107 * zero. It's based on the algorithm found on Computer Algorithms by 2108 * Sara Baase. 2109 * 2110 * A char jump is the number of characters one needs to jump based on 2111 * the value of the character from the text that was mismatched. 2112 */ 2113 static void 2114 computejumps(struct parse *p, struct re_guts *g) 2115 { 2116 int ch; 2117 int mindex; 2118 2119 /* Avoid making errors worse */ 2120 if (p->error != 0) 2121 return; 2122 2123 g->charjump = (int *)malloc((NC_MAX + 1) * sizeof(int)); 2124 if (g->charjump == NULL) /* Not a fatal error */ 2125 return; 2126 /* Adjust for signed chars, if necessary */ 2127 g->charjump = &g->charjump[-(CHAR_MIN)]; 2128 2129 /* If the character does not exist in the pattern, the jump 2130 * is equal to the number of characters in the pattern. 2131 */ 2132 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 2133 g->charjump[ch] = g->mlen; 2134 2135 /* If the character does exist, compute the jump that would 2136 * take us to the last character in the pattern equal to it 2137 * (notice that we match right to left, so that last character 2138 * is the first one that would be matched). 2139 */ 2140 for (mindex = 0; mindex < g->mlen; mindex++) 2141 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 2142 } 2143 2144 /* 2145 - computematchjumps - compute match jumps for BM scan 2146 == static void computematchjumps(struct parse *p, struct re_guts *g); 2147 * 2148 * This algorithm assumes g->must exists and is has size greater than 2149 * zero. It's based on the algorithm found on Computer Algorithms by 2150 * Sara Baase. 2151 * 2152 * A match jump is the number of characters one needs to advance based 2153 * on the already-matched suffix. 2154 * Notice that all values here are minus (g->mlen-1), because of the way 2155 * the search algorithm works. 2156 */ 2157 static void 2158 computematchjumps(struct parse *p, struct re_guts *g) 2159 { 2160 int mindex; /* General "must" iterator */ 2161 int suffix; /* Keeps track of matching suffix */ 2162 int ssuffix; /* Keeps track of suffixes' suffix */ 2163 int* pmatches; /* pmatches[k] points to the next i 2164 * such that i+1...mlen is a substring 2165 * of k+1...k+mlen-i-1 2166 */ 2167 2168 /* Avoid making errors worse */ 2169 if (p->error != 0) 2170 return; 2171 2172 pmatches = (int*) malloc(g->mlen * sizeof(int)); 2173 if (pmatches == NULL) { 2174 g->matchjump = NULL; 2175 return; 2176 } 2177 2178 g->matchjump = (int*) malloc(g->mlen * sizeof(int)); 2179 if (g->matchjump == NULL) { /* Not a fatal error */ 2180 free(pmatches); 2181 return; 2182 } 2183 2184 /* Set maximum possible jump for each character in the pattern */ 2185 for (mindex = 0; mindex < g->mlen; mindex++) 2186 g->matchjump[mindex] = 2*g->mlen - mindex - 1; 2187 2188 /* Compute pmatches[] */ 2189 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 2190 mindex--, suffix--) { 2191 pmatches[mindex] = suffix; 2192 2193 /* If a mismatch is found, interrupting the substring, 2194 * compute the matchjump for that position. If no 2195 * mismatch is found, then a text substring mismatched 2196 * against the suffix will also mismatch against the 2197 * substring. 2198 */ 2199 while (suffix < g->mlen 2200 && g->must[mindex] != g->must[suffix]) { 2201 g->matchjump[suffix] = MIN(g->matchjump[suffix], 2202 g->mlen - mindex - 1); 2203 suffix = pmatches[suffix]; 2204 } 2205 } 2206 2207 /* Compute the matchjump up to the last substring found to jump 2208 * to the beginning of the largest must pattern prefix matching 2209 * it's own suffix. 2210 */ 2211 for (mindex = 0; mindex <= suffix; mindex++) 2212 g->matchjump[mindex] = MIN(g->matchjump[mindex], 2213 g->mlen + suffix - mindex); 2214 2215 ssuffix = pmatches[suffix]; 2216 while (suffix < g->mlen) { 2217 while (suffix <= ssuffix && suffix < g->mlen) { 2218 g->matchjump[suffix] = MIN(g->matchjump[suffix], 2219 g->mlen + ssuffix - suffix); 2220 suffix++; 2221 } 2222 if (suffix < g->mlen) 2223 ssuffix = pmatches[ssuffix]; 2224 } 2225 2226 free(pmatches); 2227 } 2228 2229 /* 2230 - pluscount - count + nesting 2231 == static sopno pluscount(struct parse *p, struct re_guts *g); 2232 */ 2233 static sopno /* nesting depth */ 2234 pluscount(struct parse *p, struct re_guts *g) 2235 { 2236 sop *scan; 2237 sop s; 2238 sopno plusnest = 0; 2239 sopno maxnest = 0; 2240 2241 if (p->error != 0) 2242 return(0); /* there may not be an OEND */ 2243 2244 scan = g->strip + 1; 2245 do { 2246 s = *scan++; 2247 switch (OP(s)) { 2248 case OPLUS_: 2249 plusnest++; 2250 break; 2251 case O_PLUS: 2252 if (plusnest > maxnest) 2253 maxnest = plusnest; 2254 plusnest--; 2255 break; 2256 } 2257 } while (OP(s) != OEND); 2258 if (plusnest != 0) 2259 g->iflags |= BAD; 2260 return(maxnest); 2261 } 2262