1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Copyright (c) 2011 The FreeBSD Foundation 9 * All rights reserved. 10 * Portions of this software were developed by David Chisnall 11 * under sponsorship from the FreeBSD Foundation. 12 * 13 * This code is derived from software contributed to Berkeley by 14 * Henry Spencer. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 41 */ 42 43 #if defined(LIBC_SCCS) && !defined(lint) 44 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 45 #endif /* LIBC_SCCS and not lint */ 46 #include <sys/cdefs.h> 47 __FBSDID("$FreeBSD$"); 48 49 #include <sys/types.h> 50 #include <stdio.h> 51 #include <string.h> 52 #include <ctype.h> 53 #include <limits.h> 54 #include <stdlib.h> 55 #include <regex.h> 56 #include <stdbool.h> 57 #include <wchar.h> 58 #include <wctype.h> 59 60 #ifndef LIBREGEX 61 #include "collate.h" 62 #endif 63 64 #include "utils.h" 65 #include "regex2.h" 66 67 #include "cname.h" 68 69 /* 70 * Branching context, used to keep track of branch state for all of the branch- 71 * aware functions. In addition to keeping track of branch positions for the 72 * p_branch_* functions, we use this to simplify some clumsiness in BREs for 73 * detection of whether ^ is acting as an anchor or being used erroneously and 74 * also for whether we're in a sub-expression or not. 75 */ 76 struct branchc { 77 sopno start; 78 sopno back; 79 sopno fwd; 80 81 int nbranch; 82 int nchain; 83 bool outer; 84 bool terminate; 85 }; 86 87 /* 88 * parse structure, passed up and down to avoid global variables and 89 * other clumsinesses 90 */ 91 struct parse { 92 const char *next; /* next character in RE */ 93 const char *end; /* end of string (-> NUL normally) */ 94 int error; /* has an error been seen? */ 95 int gnuext; 96 sop *strip; /* malloced strip */ 97 sopno ssize; /* malloced strip size (allocated) */ 98 sopno slen; /* malloced strip length (used) */ 99 int ncsalloc; /* number of csets allocated */ 100 struct re_guts *g; 101 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 102 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 103 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 104 bool allowbranch; /* can this expression branch? */ 105 bool bre; /* convenience; is this a BRE? */ 106 int pflags; /* other parsing flags -- legacy escapes? */ 107 bool (*parse_expr)(struct parse *, struct branchc *); 108 void (*pre_parse)(struct parse *, struct branchc *); 109 void (*post_parse)(struct parse *, struct branchc *); 110 }; 111 112 #define PFLAG_LEGACY_ESC 0x00000001 113 114 /* ========= begin header generated by ./mkh ========= */ 115 #ifdef __cplusplus 116 extern "C" { 117 #endif 118 119 /* === regcomp.c === */ 120 static bool p_ere_exp(struct parse *p, struct branchc *bc); 121 static void p_str(struct parse *p); 122 static int p_branch_eat_delim(struct parse *p, struct branchc *bc); 123 static void p_branch_ins_offset(struct parse *p, struct branchc *bc); 124 static void p_branch_fix_tail(struct parse *p, struct branchc *bc); 125 static bool p_branch_empty(struct parse *p, struct branchc *bc); 126 static bool p_branch_do(struct parse *p, struct branchc *bc); 127 static void p_bre_pre_parse(struct parse *p, struct branchc *bc); 128 static void p_bre_post_parse(struct parse *p, struct branchc *bc); 129 static void p_re(struct parse *p, int end1, int end2); 130 static bool p_simp_re(struct parse *p, struct branchc *bc); 131 static int p_count(struct parse *p); 132 static void p_bracket(struct parse *p); 133 static int p_range_cmp(wchar_t c1, wchar_t c2); 134 static void p_b_term(struct parse *p, cset *cs); 135 static int p_b_pseudoclass(struct parse *p, char c); 136 static void p_b_cclass(struct parse *p, cset *cs); 137 static void p_b_cclass_named(struct parse *p, cset *cs, const char[]); 138 static void p_b_eclass(struct parse *p, cset *cs); 139 static wint_t p_b_symbol(struct parse *p); 140 static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 141 static bool may_escape(struct parse *p, const wint_t ch); 142 static wint_t othercase(wint_t ch); 143 static void bothcases(struct parse *p, wint_t ch); 144 static void ordinary(struct parse *p, wint_t ch); 145 static void nonnewline(struct parse *p); 146 static void repeat(struct parse *p, sopno start, int from, int to); 147 static int seterr(struct parse *p, int e); 148 static cset *allocset(struct parse *p); 149 static void freeset(struct parse *p, cset *cs); 150 static void CHadd(struct parse *p, cset *cs, wint_t ch); 151 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 152 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 153 static wint_t singleton(cset *cs); 154 static sopno dupl(struct parse *p, sopno start, sopno finish); 155 static void doemit(struct parse *p, sop op, size_t opnd); 156 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 157 static void dofwd(struct parse *p, sopno pos, sop value); 158 static int enlarge(struct parse *p, sopno size); 159 static void stripsnug(struct parse *p, struct re_guts *g); 160 static void findmust(struct parse *p, struct re_guts *g); 161 static int altoffset(sop *scan, int offset); 162 static void computejumps(struct parse *p, struct re_guts *g); 163 static void computematchjumps(struct parse *p, struct re_guts *g); 164 static sopno pluscount(struct parse *p, struct re_guts *g); 165 static wint_t wgetnext(struct parse *p); 166 167 #ifdef __cplusplus 168 } 169 #endif 170 /* ========= end header generated by ./mkh ========= */ 171 172 static char nuls[10]; /* place to point scanner in event of error */ 173 174 /* 175 * macros for use with parse structure 176 * BEWARE: these know that the parse structure is named `p' !!! 177 */ 178 #define PEEK() (*p->next) 179 #define PEEK2() (*(p->next+1)) 180 #define MORE() (p->next < p->end) 181 #define MORE2() (p->next+1 < p->end) 182 #define SEE(c) (MORE() && PEEK() == (c)) 183 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 184 #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a)) 185 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 186 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 187 #define EATSPEC(a) (p->bre ? EATTWO('\\', a) : EAT(a)) 188 #define NEXT() (p->next++) 189 #define NEXT2() (p->next += 2) 190 #define NEXTn(n) (p->next += (n)) 191 #define GETNEXT() (*p->next++) 192 #define WGETNEXT() wgetnext(p) 193 #define SETERROR(e) seterr(p, (e)) 194 #define REQUIRE(co, e) ((co) || SETERROR(e)) 195 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 196 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 197 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 198 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 199 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 200 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 201 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 202 #define HERE() (p->slen) 203 #define THERE() (p->slen - 1) 204 #define THERETHERE() (p->slen - 2) 205 #define DROP(n) (p->slen -= (n)) 206 207 /* Macro used by computejump()/computematchjump() */ 208 #define MIN(a,b) ((a)<(b)?(a):(b)) 209 210 static int /* 0 success, otherwise REG_something */ 211 regcomp_internal(regex_t * __restrict preg, 212 const char * __restrict pattern, 213 int cflags, int pflags) 214 { 215 struct parse pa; 216 struct re_guts *g; 217 struct parse *p = &pa; 218 int i; 219 size_t len; 220 size_t maxlen; 221 #ifdef REDEBUG 222 # define GOODFLAGS(f) (f) 223 #else 224 # define GOODFLAGS(f) ((f)&~REG_DUMP) 225 #endif 226 227 cflags = GOODFLAGS(cflags); 228 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 229 return(REG_INVARG); 230 231 if (cflags®_PEND) { 232 if (preg->re_endp < pattern) 233 return(REG_INVARG); 234 len = preg->re_endp - pattern; 235 } else 236 len = strlen(pattern); 237 238 /* do the mallocs early so failure handling is easy */ 239 g = (struct re_guts *)malloc(sizeof(struct re_guts)); 240 if (g == NULL) 241 return(REG_ESPACE); 242 /* 243 * Limit the pattern space to avoid a 32-bit overflow on buffer 244 * extension. Also avoid any signed overflow in case of conversion 245 * so make the real limit based on a 31-bit overflow. 246 * 247 * Likely not applicable on 64-bit systems but handle the case 248 * generically (who are we to stop people from using ~715MB+ 249 * patterns?). 250 */ 251 maxlen = ((size_t)-1 >> 1) / sizeof(sop) * 2 / 3; 252 if (len >= maxlen) { 253 free((char *)g); 254 return(REG_ESPACE); 255 } 256 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 257 assert(p->ssize >= len); 258 259 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 260 p->slen = 0; 261 if (p->strip == NULL) { 262 free((char *)g); 263 return(REG_ESPACE); 264 } 265 266 /* set things up */ 267 p->g = g; 268 p->next = pattern; /* convenience; we do not modify it */ 269 p->end = p->next + len; 270 p->error = 0; 271 p->ncsalloc = 0; 272 p->pflags = pflags; 273 for (i = 0; i < NPAREN; i++) { 274 p->pbegin[i] = 0; 275 p->pend[i] = 0; 276 } 277 #ifdef LIBREGEX 278 if (cflags®_POSIX) { 279 p->gnuext = false; 280 p->allowbranch = (cflags & REG_EXTENDED) != 0; 281 } else 282 p->gnuext = p->allowbranch = true; 283 #else 284 p->gnuext = false; 285 p->allowbranch = (cflags & REG_EXTENDED) != 0; 286 #endif 287 if (cflags & REG_EXTENDED) { 288 p->bre = false; 289 p->parse_expr = p_ere_exp; 290 p->pre_parse = NULL; 291 p->post_parse = NULL; 292 } else { 293 p->bre = true; 294 p->parse_expr = p_simp_re; 295 p->pre_parse = p_bre_pre_parse; 296 p->post_parse = p_bre_post_parse; 297 } 298 g->sets = NULL; 299 g->ncsets = 0; 300 g->cflags = cflags; 301 g->iflags = 0; 302 g->nbol = 0; 303 g->neol = 0; 304 g->must = NULL; 305 g->moffset = -1; 306 g->charjump = NULL; 307 g->matchjump = NULL; 308 g->mlen = 0; 309 g->nsub = 0; 310 g->backrefs = 0; 311 312 /* do it */ 313 EMIT(OEND, 0); 314 g->firststate = THERE(); 315 if (cflags & REG_NOSPEC) 316 p_str(p); 317 else 318 p_re(p, OUT, OUT); 319 EMIT(OEND, 0); 320 g->laststate = THERE(); 321 322 /* tidy up loose ends and fill things in */ 323 stripsnug(p, g); 324 findmust(p, g); 325 /* only use Boyer-Moore algorithm if the pattern is bigger 326 * than three characters 327 */ 328 if(g->mlen > 3) { 329 computejumps(p, g); 330 computematchjumps(p, g); 331 if(g->matchjump == NULL && g->charjump != NULL) { 332 free(g->charjump); 333 g->charjump = NULL; 334 } 335 } 336 g->nplus = pluscount(p, g); 337 g->magic = MAGIC2; 338 preg->re_nsub = g->nsub; 339 preg->re_g = g; 340 preg->re_magic = MAGIC1; 341 #ifndef REDEBUG 342 /* not debugging, so can't rely on the assert() in regexec() */ 343 if (g->iflags&BAD) 344 SETERROR(REG_ASSERT); 345 #endif 346 347 /* win or lose, we're done */ 348 if (p->error != 0) /* lose */ 349 regfree(preg); 350 return(p->error); 351 } 352 353 /* 354 - regcomp - interface for parser and compilation 355 = extern int regcomp(regex_t *, const char *, int); 356 = #define REG_BASIC 0000 357 = #define REG_EXTENDED 0001 358 = #define REG_ICASE 0002 359 = #define REG_NOSUB 0004 360 = #define REG_NEWLINE 0010 361 = #define REG_NOSPEC 0020 362 = #define REG_PEND 0040 363 = #define REG_DUMP 0200 364 */ 365 int /* 0 success, otherwise REG_something */ 366 regcomp(regex_t * __restrict preg, 367 const char * __restrict pattern, 368 int cflags) 369 { 370 371 return (regcomp_internal(preg, pattern, cflags, 0)); 372 } 373 374 #ifndef LIBREGEX 375 /* 376 * Legacy interface that requires more lax escaping behavior. 377 */ 378 int 379 freebsd12_regcomp(regex_t * __restrict preg, 380 const char * __restrict pattern, 381 int cflags, int pflags) 382 { 383 384 return (regcomp_internal(preg, pattern, cflags, PFLAG_LEGACY_ESC)); 385 } 386 387 __sym_compat(regcomp, freebsd12_regcomp, FBSD_1.0); 388 #endif /* !LIBREGEX */ 389 390 /* 391 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op, 392 - return whether we should terminate or not 393 == static bool p_ere_exp(struct parse *p); 394 */ 395 static bool 396 p_ere_exp(struct parse *p, struct branchc *bc) 397 { 398 char c; 399 wint_t wc; 400 sopno pos; 401 int count; 402 int count2; 403 #ifdef LIBREGEX 404 int i; 405 int handled; 406 #endif 407 sopno subno; 408 int wascaret = 0; 409 410 (void)bc; 411 assert(MORE()); /* caller should have ensured this */ 412 c = GETNEXT(); 413 414 #ifdef LIBREGEX 415 handled = 0; 416 #endif 417 pos = HERE(); 418 switch (c) { 419 case '(': 420 (void)REQUIRE(MORE(), REG_EPAREN); 421 p->g->nsub++; 422 subno = p->g->nsub; 423 if (subno < NPAREN) 424 p->pbegin[subno] = HERE(); 425 EMIT(OLPAREN, subno); 426 if (!SEE(')')) 427 p_re(p, ')', IGN); 428 if (subno < NPAREN) { 429 p->pend[subno] = HERE(); 430 assert(p->pend[subno] != 0); 431 } 432 EMIT(ORPAREN, subno); 433 (void)MUSTEAT(')', REG_EPAREN); 434 break; 435 #ifndef POSIX_MISTAKE 436 case ')': /* happens only if no current unmatched ( */ 437 /* 438 * You may ask, why the ifndef? Because I didn't notice 439 * this until slightly too late for 1003.2, and none of the 440 * other 1003.2 regular-expression reviewers noticed it at 441 * all. So an unmatched ) is legal POSIX, at least until 442 * we can get it fixed. 443 */ 444 SETERROR(REG_EPAREN); 445 break; 446 #endif 447 case '^': 448 EMIT(OBOL, 0); 449 p->g->iflags |= USEBOL; 450 p->g->nbol++; 451 wascaret = 1; 452 break; 453 case '$': 454 EMIT(OEOL, 0); 455 p->g->iflags |= USEEOL; 456 p->g->neol++; 457 break; 458 case '|': 459 SETERROR(REG_EMPTY); 460 break; 461 case '*': 462 case '+': 463 case '?': 464 case '{': 465 SETERROR(REG_BADRPT); 466 break; 467 case '.': 468 if (p->g->cflags®_NEWLINE) 469 nonnewline(p); 470 else 471 EMIT(OANY, 0); 472 break; 473 case '[': 474 p_bracket(p); 475 break; 476 case '\\': 477 (void)REQUIRE(MORE(), REG_EESCAPE); 478 wc = WGETNEXT(); 479 #ifdef LIBREGEX 480 if (p->gnuext) { 481 handled = 1; 482 switch (wc) { 483 case '`': 484 EMIT(OBOS, 0); 485 break; 486 case '\'': 487 EMIT(OEOS, 0); 488 break; 489 case 'B': 490 EMIT(ONWBND, 0); 491 break; 492 case 'b': 493 EMIT(OWBND, 0); 494 break; 495 case 'W': 496 case 'w': 497 case 'S': 498 case 's': 499 p_b_pseudoclass(p, wc); 500 break; 501 case '1': 502 case '2': 503 case '3': 504 case '4': 505 case '5': 506 case '6': 507 case '7': 508 case '8': 509 case '9': 510 i = wc - '0'; 511 assert(i < NPAREN); 512 if (p->pend[i] != 0) { 513 assert(i <= p->g->nsub); 514 EMIT(OBACK_, i); 515 assert(p->pbegin[i] != 0); 516 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 517 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 518 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 519 EMIT(O_BACK, i); 520 } else 521 SETERROR(REG_ESUBREG); 522 p->g->backrefs = 1; 523 break; 524 default: 525 handled = 0; 526 } 527 /* Don't proceed to the POSIX bits if we've already handled it */ 528 if (handled) 529 break; 530 } 531 #endif 532 switch (wc) { 533 case '<': 534 EMIT(OBOW, 0); 535 break; 536 case '>': 537 EMIT(OEOW, 0); 538 break; 539 default: 540 if (may_escape(p, wc)) 541 ordinary(p, wc); 542 else 543 SETERROR(REG_EESCAPE); 544 break; 545 } 546 break; 547 default: 548 if (p->error != 0) 549 return (false); 550 p->next--; 551 wc = WGETNEXT(); 552 ordinary(p, wc); 553 break; 554 } 555 556 if (!MORE()) 557 return (false); 558 c = PEEK(); 559 /* we call { a repetition if followed by a digit */ 560 if (!( c == '*' || c == '+' || c == '?' || c == '{')) 561 return (false); /* no repetition, we're done */ 562 else if (c == '{') 563 (void)REQUIRE(MORE2() && \ 564 (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT); 565 NEXT(); 566 567 (void)REQUIRE(!wascaret, REG_BADRPT); 568 switch (c) { 569 case '*': /* implemented as +? */ 570 /* this case does not require the (y|) trick, noKLUDGE */ 571 INSERT(OPLUS_, pos); 572 ASTERN(O_PLUS, pos); 573 INSERT(OQUEST_, pos); 574 ASTERN(O_QUEST, pos); 575 break; 576 case '+': 577 INSERT(OPLUS_, pos); 578 ASTERN(O_PLUS, pos); 579 break; 580 case '?': 581 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 582 INSERT(OCH_, pos); /* offset slightly wrong */ 583 ASTERN(OOR1, pos); /* this one's right */ 584 AHEAD(pos); /* fix the OCH_ */ 585 EMIT(OOR2, 0); /* offset very wrong... */ 586 AHEAD(THERE()); /* ...so fix it */ 587 ASTERN(O_CH, THERETHERE()); 588 break; 589 case '{': 590 count = p_count(p); 591 if (EAT(',')) { 592 if (isdigit((uch)PEEK())) { 593 count2 = p_count(p); 594 (void)REQUIRE(count <= count2, REG_BADBR); 595 } else /* single number with comma */ 596 count2 = INFINITY; 597 } else /* just a single number */ 598 count2 = count; 599 repeat(p, pos, count, count2); 600 if (!EAT('}')) { /* error heuristics */ 601 while (MORE() && PEEK() != '}') 602 NEXT(); 603 (void)REQUIRE(MORE(), REG_EBRACE); 604 SETERROR(REG_BADBR); 605 } 606 break; 607 } 608 609 if (!MORE()) 610 return (false); 611 c = PEEK(); 612 if (!( c == '*' || c == '+' || c == '?' || 613 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 614 return (false); 615 SETERROR(REG_BADRPT); 616 return (false); 617 } 618 619 /* 620 - p_str - string (no metacharacters) "parser" 621 == static void p_str(struct parse *p); 622 */ 623 static void 624 p_str(struct parse *p) 625 { 626 (void)REQUIRE(MORE(), REG_EMPTY); 627 while (MORE()) 628 ordinary(p, WGETNEXT()); 629 } 630 631 /* 632 * Eat consecutive branch delimiters for the kind of expression that we are 633 * parsing, return the number of delimiters that we ate. 634 */ 635 static int 636 p_branch_eat_delim(struct parse *p, struct branchc *bc) 637 { 638 int nskip; 639 640 (void)bc; 641 nskip = 0; 642 while (EATSPEC('|')) 643 ++nskip; 644 return (nskip); 645 } 646 647 /* 648 * Insert necessary branch book-keeping operations. This emits a 649 * bogus 'next' offset, since we still have more to parse 650 */ 651 static void 652 p_branch_ins_offset(struct parse *p, struct branchc *bc) 653 { 654 655 if (bc->nbranch == 0) { 656 INSERT(OCH_, bc->start); /* offset is wrong */ 657 bc->fwd = bc->start; 658 bc->back = bc->start; 659 } 660 661 ASTERN(OOR1, bc->back); 662 bc->back = THERE(); 663 AHEAD(bc->fwd); /* fix previous offset */ 664 bc->fwd = HERE(); 665 EMIT(OOR2, 0); /* offset is very wrong */ 666 ++bc->nbranch; 667 } 668 669 /* 670 * Fix the offset of the tail branch, if we actually had any branches. 671 * This is to correct the bogus placeholder offset that we use. 672 */ 673 static void 674 p_branch_fix_tail(struct parse *p, struct branchc *bc) 675 { 676 677 /* Fix bogus offset at the tail if we actually have branches */ 678 if (bc->nbranch > 0) { 679 AHEAD(bc->fwd); 680 ASTERN(O_CH, bc->back); 681 } 682 } 683 684 /* 685 * Signal to the parser that an empty branch has been encountered; this will, 686 * in the future, be used to allow for more permissive behavior with empty 687 * branches. The return value should indicate whether parsing may continue 688 * or not. 689 */ 690 static bool 691 p_branch_empty(struct parse *p, struct branchc *bc) 692 { 693 694 #if defined(LIBREGEX) && defined(NOTYET) 695 if (bc->outer) 696 p->g->iflags |= EMPTBR; 697 return (true); 698 #else 699 (void)bc; 700 SETERROR(REG_EMPTY); 701 return (false); 702 #endif 703 } 704 705 /* 706 * Take care of any branching requirements. This includes inserting the 707 * appropriate branching instructions as well as eating all of the branch 708 * delimiters until we either run out of pattern or need to parse more pattern. 709 */ 710 static bool 711 p_branch_do(struct parse *p, struct branchc *bc) 712 { 713 int ate = 0; 714 715 ate = p_branch_eat_delim(p, bc); 716 if (ate == 0) 717 return (false); 718 else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc)) 719 /* 720 * Halt parsing only if we have an empty branch and p_branch_empty 721 * indicates that we must not continue. In the future, this will not 722 * necessarily be an error. 723 */ 724 return (false); 725 p_branch_ins_offset(p, bc); 726 727 return (true); 728 } 729 730 static void 731 p_bre_pre_parse(struct parse *p, struct branchc *bc) 732 { 733 734 (void) bc; 735 /* 736 * Does not move cleanly into expression parser because of 737 * ordinary interpration of * at the beginning position of 738 * an expression. 739 */ 740 if (EAT('^')) { 741 EMIT(OBOL, 0); 742 p->g->iflags |= USEBOL; 743 p->g->nbol++; 744 } 745 } 746 747 static void 748 p_bre_post_parse(struct parse *p, struct branchc *bc) 749 { 750 751 /* Expression is terminating due to EOL token */ 752 if (bc->terminate) { 753 DROP(1); 754 EMIT(OEOL, 0); 755 p->g->iflags |= USEEOL; 756 p->g->neol++; 757 } 758 } 759 760 /* 761 - p_re - Top level parser, concatenation and BRE anchoring 762 == static void p_re(struct parse *p, int end1, int end2); 763 * Giving end1 as OUT essentially eliminates the end1/end2 check. 764 * 765 * This implementation is a bit of a kludge, in that a trailing $ is first 766 * taken as an ordinary character and then revised to be an anchor. 767 * The amount of lookahead needed to avoid this kludge is excessive. 768 */ 769 static void 770 p_re(struct parse *p, 771 int end1, /* first terminating character */ 772 int end2) /* second terminating character; ignored for EREs */ 773 { 774 struct branchc bc; 775 776 bc.nbranch = 0; 777 if (end1 == OUT && end2 == OUT) 778 bc.outer = true; 779 else 780 bc.outer = false; 781 #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2)) 782 for (;;) { 783 bc.start = HERE(); 784 bc.nchain = 0; 785 bc.terminate = false; 786 if (p->pre_parse != NULL) 787 p->pre_parse(p, &bc); 788 while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) { 789 bc.terminate = p->parse_expr(p, &bc); 790 ++bc.nchain; 791 } 792 if (p->post_parse != NULL) 793 p->post_parse(p, &bc); 794 (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY); 795 #ifdef LIBREGEX 796 if (HERE() == bc.start && !p_branch_empty(p, &bc)) 797 break; 798 #endif 799 if (!p->allowbranch) 800 break; 801 /* 802 * p_branch_do's return value indicates whether we should 803 * continue parsing or not. This is both for correctness and 804 * a slight optimization, because it will check if we've 805 * encountered an empty branch or the end of the string 806 * immediately following a branch delimiter. 807 */ 808 if (!p_branch_do(p, &bc)) 809 break; 810 } 811 #undef SEE_END 812 if (p->allowbranch) 813 p_branch_fix_tail(p, &bc); 814 assert(!MORE() || SEE(end1)); 815 } 816 817 /* 818 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 819 == static bool p_simp_re(struct parse *p, struct branchc *bc); 820 */ 821 static bool /* was the simple RE an unbackslashed $? */ 822 p_simp_re(struct parse *p, struct branchc *bc) 823 { 824 int c; 825 int cc; /* convenient/control character */ 826 int count; 827 int count2; 828 sopno pos; 829 bool handled; 830 int i; 831 wint_t wc; 832 sopno subno; 833 # define BACKSL (1<<CHAR_BIT) 834 835 pos = HERE(); /* repetition op, if any, covers from here */ 836 handled = false; 837 838 assert(MORE()); /* caller should have ensured this */ 839 c = GETNEXT(); 840 if (c == '\\') { 841 (void)REQUIRE(MORE(), REG_EESCAPE); 842 cc = GETNEXT(); 843 c = BACKSL | cc; 844 #ifdef LIBREGEX 845 if (p->gnuext) { 846 handled = true; 847 switch (c) { 848 case BACKSL|'`': 849 EMIT(OBOS, 0); 850 break; 851 case BACKSL|'\'': 852 EMIT(OEOS, 0); 853 break; 854 case BACKSL|'B': 855 EMIT(ONWBND, 0); 856 break; 857 case BACKSL|'b': 858 EMIT(OWBND, 0); 859 break; 860 case BACKSL|'W': 861 case BACKSL|'w': 862 case BACKSL|'S': 863 case BACKSL|'s': 864 p_b_pseudoclass(p, cc); 865 break; 866 default: 867 handled = false; 868 } 869 } 870 #endif 871 } 872 if (!handled) { 873 switch (c) { 874 case '.': 875 if (p->g->cflags®_NEWLINE) 876 nonnewline(p); 877 else 878 EMIT(OANY, 0); 879 break; 880 case '[': 881 p_bracket(p); 882 break; 883 case BACKSL|'<': 884 EMIT(OBOW, 0); 885 break; 886 case BACKSL|'>': 887 EMIT(OEOW, 0); 888 break; 889 case BACKSL|'{': 890 SETERROR(REG_BADRPT); 891 break; 892 case BACKSL|'(': 893 p->g->nsub++; 894 subno = p->g->nsub; 895 if (subno < NPAREN) 896 p->pbegin[subno] = HERE(); 897 EMIT(OLPAREN, subno); 898 /* the MORE here is an error heuristic */ 899 if (MORE() && !SEETWO('\\', ')')) 900 p_re(p, '\\', ')'); 901 if (subno < NPAREN) { 902 p->pend[subno] = HERE(); 903 assert(p->pend[subno] != 0); 904 } 905 EMIT(ORPAREN, subno); 906 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 907 break; 908 case BACKSL|')': /* should not get here -- must be user */ 909 SETERROR(REG_EPAREN); 910 break; 911 case BACKSL|'1': 912 case BACKSL|'2': 913 case BACKSL|'3': 914 case BACKSL|'4': 915 case BACKSL|'5': 916 case BACKSL|'6': 917 case BACKSL|'7': 918 case BACKSL|'8': 919 case BACKSL|'9': 920 i = (c&~BACKSL) - '0'; 921 assert(i < NPAREN); 922 if (p->pend[i] != 0) { 923 assert(i <= p->g->nsub); 924 EMIT(OBACK_, i); 925 assert(p->pbegin[i] != 0); 926 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 927 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 928 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 929 EMIT(O_BACK, i); 930 } else 931 SETERROR(REG_ESUBREG); 932 p->g->backrefs = 1; 933 break; 934 case '*': 935 /* 936 * Ordinary if used as the first character beyond BOL anchor of 937 * a (sub-)expression, counts as a bad repetition operator if it 938 * appears otherwise. 939 */ 940 (void)REQUIRE(bc->nchain == 0, REG_BADRPT); 941 /* FALLTHROUGH */ 942 default: 943 if (p->error != 0) 944 return (false); /* Definitely not $... */ 945 p->next--; 946 wc = WGETNEXT(); 947 if ((c & BACKSL) == 0 || may_escape(p, wc)) 948 ordinary(p, wc); 949 else 950 SETERROR(REG_EESCAPE); 951 break; 952 } 953 } 954 955 if (EAT('*')) { /* implemented as +? */ 956 /* this case does not require the (y|) trick, noKLUDGE */ 957 INSERT(OPLUS_, pos); 958 ASTERN(O_PLUS, pos); 959 INSERT(OQUEST_, pos); 960 ASTERN(O_QUEST, pos); 961 #ifdef LIBREGEX 962 } else if (p->gnuext && EATTWO('\\', '?')) { 963 INSERT(OQUEST_, pos); 964 ASTERN(O_QUEST, pos); 965 } else if (p->gnuext && EATTWO('\\', '+')) { 966 INSERT(OPLUS_, pos); 967 ASTERN(O_PLUS, pos); 968 #endif 969 } else if (EATTWO('\\', '{')) { 970 count = p_count(p); 971 if (EAT(',')) { 972 if (MORE() && isdigit((uch)PEEK())) { 973 count2 = p_count(p); 974 (void)REQUIRE(count <= count2, REG_BADBR); 975 } else /* single number with comma */ 976 count2 = INFINITY; 977 } else /* just a single number */ 978 count2 = count; 979 repeat(p, pos, count, count2); 980 if (!EATTWO('\\', '}')) { /* error heuristics */ 981 while (MORE() && !SEETWO('\\', '}')) 982 NEXT(); 983 (void)REQUIRE(MORE(), REG_EBRACE); 984 SETERROR(REG_BADBR); 985 } 986 } else if (c == '$') /* $ (but not \$) ends it */ 987 return (true); 988 989 return (false); 990 } 991 992 /* 993 - p_count - parse a repetition count 994 == static int p_count(struct parse *p); 995 */ 996 static int /* the value */ 997 p_count(struct parse *p) 998 { 999 int count = 0; 1000 int ndigits = 0; 1001 1002 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 1003 count = count*10 + (GETNEXT() - '0'); 1004 ndigits++; 1005 } 1006 1007 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 1008 return(count); 1009 } 1010 1011 /* 1012 - p_bracket - parse a bracketed character list 1013 == static void p_bracket(struct parse *p); 1014 */ 1015 static void 1016 p_bracket(struct parse *p) 1017 { 1018 cset *cs; 1019 wint_t ch; 1020 1021 /* Dept of Truly Sickening Special-Case Kludges */ 1022 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 1023 EMIT(OBOW, 0); 1024 NEXTn(6); 1025 return; 1026 } 1027 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 1028 EMIT(OEOW, 0); 1029 NEXTn(6); 1030 return; 1031 } 1032 1033 if ((cs = allocset(p)) == NULL) 1034 return; 1035 1036 if (p->g->cflags®_ICASE) 1037 cs->icase = 1; 1038 if (EAT('^')) 1039 cs->invert = 1; 1040 if (EAT(']')) 1041 CHadd(p, cs, ']'); 1042 else if (EAT('-')) 1043 CHadd(p, cs, '-'); 1044 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 1045 p_b_term(p, cs); 1046 if (EAT('-')) 1047 CHadd(p, cs, '-'); 1048 (void)MUSTEAT(']', REG_EBRACK); 1049 1050 if (p->error != 0) /* don't mess things up further */ 1051 return; 1052 1053 if (cs->invert && p->g->cflags®_NEWLINE) 1054 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 1055 1056 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 1057 ordinary(p, ch); 1058 freeset(p, cs); 1059 } else 1060 EMIT(OANYOF, (int)(cs - p->g->sets)); 1061 } 1062 1063 static int 1064 p_range_cmp(wchar_t c1, wchar_t c2) 1065 { 1066 #ifndef LIBREGEX 1067 return __wcollate_range_cmp(c1, c2); 1068 #else 1069 /* Copied from libc/collate __wcollate_range_cmp */ 1070 wchar_t s1[2], s2[2]; 1071 1072 s1[0] = c1; 1073 s1[1] = L'\0'; 1074 s2[0] = c2; 1075 s2[1] = L'\0'; 1076 return (wcscoll(s1, s2)); 1077 #endif 1078 } 1079 1080 /* 1081 - p_b_term - parse one term of a bracketed character list 1082 == static void p_b_term(struct parse *p, cset *cs); 1083 */ 1084 static void 1085 p_b_term(struct parse *p, cset *cs) 1086 { 1087 char c; 1088 wint_t start, finish; 1089 wint_t i; 1090 #ifndef LIBREGEX 1091 struct xlocale_collate *table = 1092 (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE]; 1093 #endif 1094 /* classify what we've got */ 1095 switch ((MORE()) ? PEEK() : '\0') { 1096 case '[': 1097 c = (MORE2()) ? PEEK2() : '\0'; 1098 break; 1099 case '-': 1100 SETERROR(REG_ERANGE); 1101 return; /* NOTE RETURN */ 1102 default: 1103 c = '\0'; 1104 break; 1105 } 1106 1107 switch (c) { 1108 case ':': /* character class */ 1109 NEXT2(); 1110 (void)REQUIRE(MORE(), REG_EBRACK); 1111 c = PEEK(); 1112 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 1113 p_b_cclass(p, cs); 1114 (void)REQUIRE(MORE(), REG_EBRACK); 1115 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 1116 break; 1117 case '=': /* equivalence class */ 1118 NEXT2(); 1119 (void)REQUIRE(MORE(), REG_EBRACK); 1120 c = PEEK(); 1121 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 1122 p_b_eclass(p, cs); 1123 (void)REQUIRE(MORE(), REG_EBRACK); 1124 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 1125 break; 1126 default: /* symbol, ordinary character, or range */ 1127 start = p_b_symbol(p); 1128 if (SEE('-') && MORE2() && PEEK2() != ']') { 1129 /* range */ 1130 NEXT(); 1131 if (EAT('-')) 1132 finish = '-'; 1133 else 1134 finish = p_b_symbol(p); 1135 } else 1136 finish = start; 1137 if (start == finish) 1138 CHadd(p, cs, start); 1139 else { 1140 #ifndef LIBREGEX 1141 if (table->__collate_load_error || MB_CUR_MAX > 1) { 1142 #else 1143 if (MB_CUR_MAX > 1) { 1144 #endif 1145 (void)REQUIRE(start <= finish, REG_ERANGE); 1146 CHaddrange(p, cs, start, finish); 1147 } else { 1148 (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE); 1149 for (i = 0; i <= UCHAR_MAX; i++) { 1150 if (p_range_cmp(start, i) <= 0 && 1151 p_range_cmp(i, finish) <= 0 ) 1152 CHadd(p, cs, i); 1153 } 1154 } 1155 } 1156 break; 1157 } 1158 } 1159 1160 /* 1161 - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S) 1162 == static int p_b_pseudoclass(struct parse *p, char c) 1163 */ 1164 static int 1165 p_b_pseudoclass(struct parse *p, char c) { 1166 cset *cs; 1167 1168 if ((cs = allocset(p)) == NULL) 1169 return(0); 1170 1171 if (p->g->cflags®_ICASE) 1172 cs->icase = 1; 1173 1174 switch (c) { 1175 case 'W': 1176 cs->invert = 1; 1177 /* PASSTHROUGH */ 1178 case 'w': 1179 p_b_cclass_named(p, cs, "alnum"); 1180 break; 1181 case 'S': 1182 cs->invert = 1; 1183 /* PASSTHROUGH */ 1184 case 's': 1185 p_b_cclass_named(p, cs, "space"); 1186 break; 1187 default: 1188 return(0); 1189 } 1190 1191 EMIT(OANYOF, (int)(cs - p->g->sets)); 1192 return(1); 1193 } 1194 1195 /* 1196 - p_b_cclass - parse a character-class name and deal with it 1197 == static void p_b_cclass(struct parse *p, cset *cs); 1198 */ 1199 static void 1200 p_b_cclass(struct parse *p, cset *cs) 1201 { 1202 const char *sp = p->next; 1203 size_t len; 1204 char clname[16]; 1205 1206 while (MORE() && isalpha((uch)PEEK())) 1207 NEXT(); 1208 len = p->next - sp; 1209 if (len >= sizeof(clname) - 1) { 1210 SETERROR(REG_ECTYPE); 1211 return; 1212 } 1213 memcpy(clname, sp, len); 1214 clname[len] = '\0'; 1215 1216 p_b_cclass_named(p, cs, clname); 1217 } 1218 /* 1219 - p_b_cclass_named - deal with a named character class 1220 == static void p_b_cclass_named(struct parse *p, cset *cs, const char []); 1221 */ 1222 static void 1223 p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) { 1224 wctype_t wct; 1225 1226 if ((wct = wctype(clname)) == 0) { 1227 SETERROR(REG_ECTYPE); 1228 return; 1229 } 1230 CHaddtype(p, cs, wct); 1231 } 1232 1233 /* 1234 - p_b_eclass - parse an equivalence-class name and deal with it 1235 == static void p_b_eclass(struct parse *p, cset *cs); 1236 * 1237 * This implementation is incomplete. xxx 1238 */ 1239 static void 1240 p_b_eclass(struct parse *p, cset *cs) 1241 { 1242 wint_t c; 1243 1244 c = p_b_coll_elem(p, '='); 1245 CHadd(p, cs, c); 1246 } 1247 1248 /* 1249 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 1250 == static wint_t p_b_symbol(struct parse *p); 1251 */ 1252 static wint_t /* value of symbol */ 1253 p_b_symbol(struct parse *p) 1254 { 1255 wint_t value; 1256 1257 (void)REQUIRE(MORE(), REG_EBRACK); 1258 if (!EATTWO('[', '.')) 1259 return(WGETNEXT()); 1260 1261 /* collating symbol */ 1262 value = p_b_coll_elem(p, '.'); 1263 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 1264 return(value); 1265 } 1266 1267 /* 1268 - p_b_coll_elem - parse a collating-element name and look it up 1269 == static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 1270 */ 1271 static wint_t /* value of collating element */ 1272 p_b_coll_elem(struct parse *p, 1273 wint_t endc) /* name ended by endc,']' */ 1274 { 1275 const char *sp = p->next; 1276 struct cname *cp; 1277 mbstate_t mbs; 1278 wchar_t wc; 1279 size_t clen, len; 1280 1281 while (MORE() && !SEETWO(endc, ']')) 1282 NEXT(); 1283 if (!MORE()) { 1284 SETERROR(REG_EBRACK); 1285 return(0); 1286 } 1287 len = p->next - sp; 1288 for (cp = cnames; cp->name != NULL; cp++) 1289 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) 1290 return(cp->code); /* known name */ 1291 memset(&mbs, 0, sizeof(mbs)); 1292 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 1293 return (wc); /* single character */ 1294 else if (clen == (size_t)-1 || clen == (size_t)-2) 1295 SETERROR(REG_ILLSEQ); 1296 else 1297 SETERROR(REG_ECOLLATE); /* neither */ 1298 return(0); 1299 } 1300 1301 /* 1302 - may_escape - determine whether 'ch' is escape-able in the current context 1303 == static int may_escape(struct parse *p, const wint_t ch) 1304 */ 1305 static bool 1306 may_escape(struct parse *p, const wint_t ch) 1307 { 1308 1309 if ((p->pflags & PFLAG_LEGACY_ESC) != 0) 1310 return (true); 1311 if (isalpha(ch) || ch == '\'' || ch == '`') 1312 return (false); 1313 return (true); 1314 #ifdef NOTYET 1315 /* 1316 * Build a whitelist of characters that may be escaped to produce an 1317 * ordinary in the current context. This assumes that these have not 1318 * been otherwise interpreted as a special character. Escaping an 1319 * ordinary character yields undefined results according to 1320 * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take 1321 * advantage of this and use escaped ordinary characters to provide 1322 * special meaning, e.g. \b, \B, \w, \W, \s, \S. 1323 */ 1324 switch(ch) { 1325 case '|': 1326 case '+': 1327 case '?': 1328 /* The above characters may not be escaped in BREs */ 1329 if (!(p->g->cflags®_EXTENDED)) 1330 return (false); 1331 /* Fallthrough */ 1332 case '(': 1333 case ')': 1334 case '{': 1335 case '}': 1336 case '.': 1337 case '[': 1338 case ']': 1339 case '\\': 1340 case '*': 1341 case '^': 1342 case '$': 1343 return (true); 1344 default: 1345 return (false); 1346 } 1347 #endif 1348 } 1349 1350 /* 1351 - othercase - return the case counterpart of an alphabetic 1352 == static wint_t othercase(wint_t ch); 1353 */ 1354 static wint_t /* if no counterpart, return ch */ 1355 othercase(wint_t ch) 1356 { 1357 assert(iswalpha(ch)); 1358 if (iswupper(ch)) 1359 return(towlower(ch)); 1360 else if (iswlower(ch)) 1361 return(towupper(ch)); 1362 else /* peculiar, but could happen */ 1363 return(ch); 1364 } 1365 1366 /* 1367 - bothcases - emit a dualcase version of a two-case character 1368 == static void bothcases(struct parse *p, wint_t ch); 1369 * 1370 * Boy, is this implementation ever a kludge... 1371 */ 1372 static void 1373 bothcases(struct parse *p, wint_t ch) 1374 { 1375 const char *oldnext = p->next; 1376 const char *oldend = p->end; 1377 char bracket[3 + MB_LEN_MAX]; 1378 size_t n; 1379 mbstate_t mbs; 1380 1381 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1382 p->next = bracket; 1383 memset(&mbs, 0, sizeof(mbs)); 1384 n = wcrtomb(bracket, ch, &mbs); 1385 assert(n != (size_t)-1); 1386 bracket[n] = ']'; 1387 bracket[n + 1] = '\0'; 1388 p->end = bracket+n+1; 1389 p_bracket(p); 1390 assert(p->next == p->end); 1391 p->next = oldnext; 1392 p->end = oldend; 1393 } 1394 1395 /* 1396 - ordinary - emit an ordinary character 1397 == static void ordinary(struct parse *p, wint_t ch); 1398 */ 1399 static void 1400 ordinary(struct parse *p, wint_t ch) 1401 { 1402 cset *cs; 1403 1404 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 1405 bothcases(p, ch); 1406 else if ((ch & OPDMASK) == ch) 1407 EMIT(OCHAR, ch); 1408 else { 1409 /* 1410 * Kludge: character is too big to fit into an OCHAR operand. 1411 * Emit a singleton set. 1412 */ 1413 if ((cs = allocset(p)) == NULL) 1414 return; 1415 CHadd(p, cs, ch); 1416 EMIT(OANYOF, (int)(cs - p->g->sets)); 1417 } 1418 } 1419 1420 /* 1421 - nonnewline - emit REG_NEWLINE version of OANY 1422 == static void nonnewline(struct parse *p); 1423 * 1424 * Boy, is this implementation ever a kludge... 1425 */ 1426 static void 1427 nonnewline(struct parse *p) 1428 { 1429 const char *oldnext = p->next; 1430 const char *oldend = p->end; 1431 char bracket[4]; 1432 1433 p->next = bracket; 1434 p->end = bracket+3; 1435 bracket[0] = '^'; 1436 bracket[1] = '\n'; 1437 bracket[2] = ']'; 1438 bracket[3] = '\0'; 1439 p_bracket(p); 1440 assert(p->next == bracket+3); 1441 p->next = oldnext; 1442 p->end = oldend; 1443 } 1444 1445 /* 1446 - repeat - generate code for a bounded repetition, recursively if needed 1447 == static void repeat(struct parse *p, sopno start, int from, int to); 1448 */ 1449 static void 1450 repeat(struct parse *p, 1451 sopno start, /* operand from here to end of strip */ 1452 int from, /* repeated from this number */ 1453 int to) /* to this number of times (maybe INFINITY) */ 1454 { 1455 sopno finish = HERE(); 1456 # define N 2 1457 # define INF 3 1458 # define REP(f, t) ((f)*8 + (t)) 1459 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1460 sopno copy; 1461 1462 if (p->error != 0) /* head off possible runaway recursion */ 1463 return; 1464 1465 assert(from <= to); 1466 1467 switch (REP(MAP(from), MAP(to))) { 1468 case REP(0, 0): /* must be user doing this */ 1469 DROP(finish-start); /* drop the operand */ 1470 break; 1471 case REP(0, 1): /* as x{1,1}? */ 1472 case REP(0, N): /* as x{1,n}? */ 1473 case REP(0, INF): /* as x{1,}? */ 1474 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1475 INSERT(OCH_, start); /* offset is wrong... */ 1476 repeat(p, start+1, 1, to); 1477 ASTERN(OOR1, start); 1478 AHEAD(start); /* ... fix it */ 1479 EMIT(OOR2, 0); 1480 AHEAD(THERE()); 1481 ASTERN(O_CH, THERETHERE()); 1482 break; 1483 case REP(1, 1): /* trivial case */ 1484 /* done */ 1485 break; 1486 case REP(1, N): /* as x?x{1,n-1} */ 1487 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1488 INSERT(OCH_, start); 1489 ASTERN(OOR1, start); 1490 AHEAD(start); 1491 EMIT(OOR2, 0); /* offset very wrong... */ 1492 AHEAD(THERE()); /* ...so fix it */ 1493 ASTERN(O_CH, THERETHERE()); 1494 copy = dupl(p, start+1, finish+1); 1495 assert(copy == finish+4); 1496 repeat(p, copy, 1, to-1); 1497 break; 1498 case REP(1, INF): /* as x+ */ 1499 INSERT(OPLUS_, start); 1500 ASTERN(O_PLUS, start); 1501 break; 1502 case REP(N, N): /* as xx{m-1,n-1} */ 1503 copy = dupl(p, start, finish); 1504 repeat(p, copy, from-1, to-1); 1505 break; 1506 case REP(N, INF): /* as xx{n-1,INF} */ 1507 copy = dupl(p, start, finish); 1508 repeat(p, copy, from-1, to); 1509 break; 1510 default: /* "can't happen" */ 1511 SETERROR(REG_ASSERT); /* just in case */ 1512 break; 1513 } 1514 } 1515 1516 /* 1517 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1518 - character from the parse struct, signals a REG_ILLSEQ error if the 1519 - character can't be converted. Returns the number of bytes consumed. 1520 */ 1521 static wint_t 1522 wgetnext(struct parse *p) 1523 { 1524 mbstate_t mbs; 1525 wchar_t wc; 1526 size_t n; 1527 1528 memset(&mbs, 0, sizeof(mbs)); 1529 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); 1530 if (n == (size_t)-1 || n == (size_t)-2) { 1531 SETERROR(REG_ILLSEQ); 1532 return (0); 1533 } 1534 if (n == 0) 1535 n = 1; 1536 p->next += n; 1537 return (wc); 1538 } 1539 1540 /* 1541 - seterr - set an error condition 1542 == static int seterr(struct parse *p, int e); 1543 */ 1544 static int /* useless but makes type checking happy */ 1545 seterr(struct parse *p, int e) 1546 { 1547 if (p->error == 0) /* keep earliest error condition */ 1548 p->error = e; 1549 p->next = nuls; /* try to bring things to a halt */ 1550 p->end = nuls; 1551 return(0); /* make the return value well-defined */ 1552 } 1553 1554 /* 1555 - allocset - allocate a set of characters for [] 1556 == static cset *allocset(struct parse *p); 1557 */ 1558 static cset * 1559 allocset(struct parse *p) 1560 { 1561 cset *cs, *ncs; 1562 1563 ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs)); 1564 if (ncs == NULL) { 1565 SETERROR(REG_ESPACE); 1566 return (NULL); 1567 } 1568 p->g->sets = ncs; 1569 cs = &p->g->sets[p->g->ncsets++]; 1570 memset(cs, 0, sizeof(*cs)); 1571 1572 return(cs); 1573 } 1574 1575 /* 1576 - freeset - free a now-unused set 1577 == static void freeset(struct parse *p, cset *cs); 1578 */ 1579 static void 1580 freeset(struct parse *p, cset *cs) 1581 { 1582 cset *top = &p->g->sets[p->g->ncsets]; 1583 1584 free(cs->wides); 1585 free(cs->ranges); 1586 free(cs->types); 1587 memset(cs, 0, sizeof(*cs)); 1588 if (cs == top-1) /* recover only the easy case */ 1589 p->g->ncsets--; 1590 } 1591 1592 /* 1593 - singleton - Determine whether a set contains only one character, 1594 - returning it if so, otherwise returning OUT. 1595 */ 1596 static wint_t 1597 singleton(cset *cs) 1598 { 1599 wint_t i, s, n; 1600 1601 for (i = n = 0; i < NC; i++) 1602 if (CHIN(cs, i)) { 1603 n++; 1604 s = i; 1605 } 1606 if (n == 1) 1607 return (s); 1608 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1609 cs->icase == 0) 1610 return (cs->wides[0]); 1611 /* Don't bother handling the other cases. */ 1612 return (OUT); 1613 } 1614 1615 /* 1616 - CHadd - add character to character set. 1617 */ 1618 static void 1619 CHadd(struct parse *p, cset *cs, wint_t ch) 1620 { 1621 wint_t nch, *newwides; 1622 assert(ch >= 0); 1623 if (ch < NC) 1624 cs->bmp[ch >> 3] |= 1 << (ch & 7); 1625 else { 1626 newwides = reallocarray(cs->wides, cs->nwides + 1, 1627 sizeof(*cs->wides)); 1628 if (newwides == NULL) { 1629 SETERROR(REG_ESPACE); 1630 return; 1631 } 1632 cs->wides = newwides; 1633 cs->wides[cs->nwides++] = ch; 1634 } 1635 if (cs->icase) { 1636 if ((nch = towlower(ch)) < NC) 1637 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1638 if ((nch = towupper(ch)) < NC) 1639 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1640 } 1641 } 1642 1643 /* 1644 - CHaddrange - add all characters in the range [min,max] to a character set. 1645 */ 1646 static void 1647 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 1648 { 1649 crange *newranges; 1650 1651 for (; min < NC && min <= max; min++) 1652 CHadd(p, cs, min); 1653 if (min >= max) 1654 return; 1655 newranges = reallocarray(cs->ranges, cs->nranges + 1, 1656 sizeof(*cs->ranges)); 1657 if (newranges == NULL) { 1658 SETERROR(REG_ESPACE); 1659 return; 1660 } 1661 cs->ranges = newranges; 1662 cs->ranges[cs->nranges].min = min; 1663 cs->ranges[cs->nranges].max = max; 1664 cs->nranges++; 1665 } 1666 1667 /* 1668 - CHaddtype - add all characters of a certain type to a character set. 1669 */ 1670 static void 1671 CHaddtype(struct parse *p, cset *cs, wctype_t wct) 1672 { 1673 wint_t i; 1674 wctype_t *newtypes; 1675 1676 for (i = 0; i < NC; i++) 1677 if (iswctype(i, wct)) 1678 CHadd(p, cs, i); 1679 newtypes = reallocarray(cs->types, cs->ntypes + 1, 1680 sizeof(*cs->types)); 1681 if (newtypes == NULL) { 1682 SETERROR(REG_ESPACE); 1683 return; 1684 } 1685 cs->types = newtypes; 1686 cs->types[cs->ntypes++] = wct; 1687 } 1688 1689 /* 1690 - dupl - emit a duplicate of a bunch of sops 1691 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1692 */ 1693 static sopno /* start of duplicate */ 1694 dupl(struct parse *p, 1695 sopno start, /* from here */ 1696 sopno finish) /* to this less one */ 1697 { 1698 sopno ret = HERE(); 1699 sopno len = finish - start; 1700 1701 assert(finish >= start); 1702 if (len == 0) 1703 return(ret); 1704 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1705 return(ret); 1706 (void) memcpy((char *)(p->strip + p->slen), 1707 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1708 p->slen += len; 1709 return(ret); 1710 } 1711 1712 /* 1713 - doemit - emit a strip operator 1714 == static void doemit(struct parse *p, sop op, size_t opnd); 1715 * 1716 * It might seem better to implement this as a macro with a function as 1717 * hard-case backup, but it's just too big and messy unless there are 1718 * some changes to the data structures. Maybe later. 1719 */ 1720 static void 1721 doemit(struct parse *p, sop op, size_t opnd) 1722 { 1723 /* avoid making error situations worse */ 1724 if (p->error != 0) 1725 return; 1726 1727 /* deal with oversize operands ("can't happen", more or less) */ 1728 assert(opnd < 1<<OPSHIFT); 1729 1730 /* deal with undersized strip */ 1731 if (p->slen >= p->ssize) 1732 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1733 return; 1734 1735 /* finally, it's all reduced to the easy case */ 1736 p->strip[p->slen++] = SOP(op, opnd); 1737 } 1738 1739 /* 1740 - doinsert - insert a sop into the strip 1741 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1742 */ 1743 static void 1744 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1745 { 1746 sopno sn; 1747 sop s; 1748 int i; 1749 1750 /* avoid making error situations worse */ 1751 if (p->error != 0) 1752 return; 1753 1754 sn = HERE(); 1755 EMIT(op, opnd); /* do checks, ensure space */ 1756 assert(HERE() == sn+1); 1757 s = p->strip[sn]; 1758 1759 /* adjust paren pointers */ 1760 assert(pos > 0); 1761 for (i = 1; i < NPAREN; i++) { 1762 if (p->pbegin[i] >= pos) { 1763 p->pbegin[i]++; 1764 } 1765 if (p->pend[i] >= pos) { 1766 p->pend[i]++; 1767 } 1768 } 1769 1770 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1771 (HERE()-pos-1)*sizeof(sop)); 1772 p->strip[pos] = s; 1773 } 1774 1775 /* 1776 - dofwd - complete a forward reference 1777 == static void dofwd(struct parse *p, sopno pos, sop value); 1778 */ 1779 static void 1780 dofwd(struct parse *p, sopno pos, sop value) 1781 { 1782 /* avoid making error situations worse */ 1783 if (p->error != 0) 1784 return; 1785 1786 assert(value < 1<<OPSHIFT); 1787 p->strip[pos] = OP(p->strip[pos]) | value; 1788 } 1789 1790 /* 1791 - enlarge - enlarge the strip 1792 == static int enlarge(struct parse *p, sopno size); 1793 */ 1794 static int 1795 enlarge(struct parse *p, sopno size) 1796 { 1797 sop *sp; 1798 1799 if (p->ssize >= size) 1800 return 1; 1801 1802 sp = reallocarray(p->strip, size, sizeof(sop)); 1803 if (sp == NULL) { 1804 SETERROR(REG_ESPACE); 1805 return 0; 1806 } 1807 p->strip = sp; 1808 p->ssize = size; 1809 return 1; 1810 } 1811 1812 /* 1813 - stripsnug - compact the strip 1814 == static void stripsnug(struct parse *p, struct re_guts *g); 1815 */ 1816 static void 1817 stripsnug(struct parse *p, struct re_guts *g) 1818 { 1819 g->nstates = p->slen; 1820 g->strip = reallocarray((char *)p->strip, p->slen, sizeof(sop)); 1821 if (g->strip == NULL) { 1822 SETERROR(REG_ESPACE); 1823 g->strip = p->strip; 1824 } 1825 } 1826 1827 /* 1828 - findmust - fill in must and mlen with longest mandatory literal string 1829 == static void findmust(struct parse *p, struct re_guts *g); 1830 * 1831 * This algorithm could do fancy things like analyzing the operands of | 1832 * for common subsequences. Someday. This code is simple and finds most 1833 * of the interesting cases. 1834 * 1835 * Note that must and mlen got initialized during setup. 1836 */ 1837 static void 1838 findmust(struct parse *p, struct re_guts *g) 1839 { 1840 sop *scan; 1841 sop *start = NULL; 1842 sop *newstart = NULL; 1843 sopno newlen; 1844 sop s; 1845 char *cp; 1846 int offset; 1847 char buf[MB_LEN_MAX]; 1848 size_t clen; 1849 mbstate_t mbs; 1850 1851 /* avoid making error situations worse */ 1852 if (p->error != 0) 1853 return; 1854 1855 /* 1856 * It's not generally safe to do a ``char'' substring search on 1857 * multibyte character strings, but it's safe for at least 1858 * UTF-8 (see RFC 3629). 1859 */ 1860 if (MB_CUR_MAX > 1 && 1861 strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) 1862 return; 1863 1864 /* find the longest OCHAR sequence in strip */ 1865 newlen = 0; 1866 offset = 0; 1867 g->moffset = 0; 1868 scan = g->strip + 1; 1869 do { 1870 s = *scan++; 1871 switch (OP(s)) { 1872 case OCHAR: /* sequence member */ 1873 if (newlen == 0) { /* new sequence */ 1874 memset(&mbs, 0, sizeof(mbs)); 1875 newstart = scan - 1; 1876 } 1877 clen = wcrtomb(buf, OPND(s), &mbs); 1878 if (clen == (size_t)-1) 1879 goto toohard; 1880 newlen += clen; 1881 break; 1882 case OPLUS_: /* things that don't break one */ 1883 case OLPAREN: 1884 case ORPAREN: 1885 break; 1886 case OQUEST_: /* things that must be skipped */ 1887 case OCH_: 1888 offset = altoffset(scan, offset); 1889 scan--; 1890 do { 1891 scan += OPND(s); 1892 s = *scan; 1893 /* assert() interferes w debug printouts */ 1894 if (OP(s) != (sop)O_QUEST && 1895 OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) { 1896 g->iflags |= BAD; 1897 return; 1898 } 1899 } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH); 1900 /* FALLTHROUGH */ 1901 case OBOW: /* things that break a sequence */ 1902 case OEOW: 1903 case OBOL: 1904 case OEOL: 1905 case OBOS: 1906 case OEOS: 1907 case OWBND: 1908 case ONWBND: 1909 case O_QUEST: 1910 case O_CH: 1911 case OEND: 1912 if (newlen > (sopno)g->mlen) { /* ends one */ 1913 start = newstart; 1914 g->mlen = newlen; 1915 if (offset > -1) { 1916 g->moffset += offset; 1917 offset = newlen; 1918 } else 1919 g->moffset = offset; 1920 } else { 1921 if (offset > -1) 1922 offset += newlen; 1923 } 1924 newlen = 0; 1925 break; 1926 case OANY: 1927 if (newlen > (sopno)g->mlen) { /* ends one */ 1928 start = newstart; 1929 g->mlen = newlen; 1930 if (offset > -1) { 1931 g->moffset += offset; 1932 offset = newlen; 1933 } else 1934 g->moffset = offset; 1935 } else { 1936 if (offset > -1) 1937 offset += newlen; 1938 } 1939 if (offset > -1) 1940 offset++; 1941 newlen = 0; 1942 break; 1943 case OANYOF: /* may or may not invalidate offset */ 1944 /* First, everything as OANY */ 1945 if (newlen > (sopno)g->mlen) { /* ends one */ 1946 start = newstart; 1947 g->mlen = newlen; 1948 if (offset > -1) { 1949 g->moffset += offset; 1950 offset = newlen; 1951 } else 1952 g->moffset = offset; 1953 } else { 1954 if (offset > -1) 1955 offset += newlen; 1956 } 1957 if (offset > -1) 1958 offset++; 1959 newlen = 0; 1960 break; 1961 toohard: 1962 default: 1963 /* Anything here makes it impossible or too hard 1964 * to calculate the offset -- so we give up; 1965 * save the last known good offset, in case the 1966 * must sequence doesn't occur later. 1967 */ 1968 if (newlen > (sopno)g->mlen) { /* ends one */ 1969 start = newstart; 1970 g->mlen = newlen; 1971 if (offset > -1) 1972 g->moffset += offset; 1973 else 1974 g->moffset = offset; 1975 } 1976 offset = -1; 1977 newlen = 0; 1978 break; 1979 } 1980 } while (OP(s) != OEND); 1981 1982 if (g->mlen == 0) { /* there isn't one */ 1983 g->moffset = -1; 1984 return; 1985 } 1986 1987 /* turn it into a character string */ 1988 g->must = malloc((size_t)g->mlen + 1); 1989 if (g->must == NULL) { /* argh; just forget it */ 1990 g->mlen = 0; 1991 g->moffset = -1; 1992 return; 1993 } 1994 cp = g->must; 1995 scan = start; 1996 memset(&mbs, 0, sizeof(mbs)); 1997 while (cp < g->must + g->mlen) { 1998 while (OP(s = *scan++) != OCHAR) 1999 continue; 2000 clen = wcrtomb(cp, OPND(s), &mbs); 2001 assert(clen != (size_t)-1); 2002 cp += clen; 2003 } 2004 assert(cp == g->must + g->mlen); 2005 *cp++ = '\0'; /* just on general principles */ 2006 } 2007 2008 /* 2009 - altoffset - choose biggest offset among multiple choices 2010 == static int altoffset(sop *scan, int offset); 2011 * 2012 * Compute, recursively if necessary, the largest offset among multiple 2013 * re paths. 2014 */ 2015 static int 2016 altoffset(sop *scan, int offset) 2017 { 2018 int largest; 2019 int try; 2020 sop s; 2021 2022 /* If we gave up already on offsets, return */ 2023 if (offset == -1) 2024 return -1; 2025 2026 largest = 0; 2027 try = 0; 2028 s = *scan++; 2029 while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) { 2030 switch (OP(s)) { 2031 case OOR1: 2032 if (try > largest) 2033 largest = try; 2034 try = 0; 2035 break; 2036 case OQUEST_: 2037 case OCH_: 2038 try = altoffset(scan, try); 2039 if (try == -1) 2040 return -1; 2041 scan--; 2042 do { 2043 scan += OPND(s); 2044 s = *scan; 2045 if (OP(s) != (sop)O_QUEST && 2046 OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) 2047 return -1; 2048 } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH); 2049 /* We must skip to the next position, or we'll 2050 * leave altoffset() too early. 2051 */ 2052 scan++; 2053 break; 2054 case OANYOF: 2055 case OCHAR: 2056 case OANY: 2057 try++; 2058 case OBOW: 2059 case OEOW: 2060 case OWBND: 2061 case ONWBND: 2062 case OLPAREN: 2063 case ORPAREN: 2064 case OOR2: 2065 break; 2066 default: 2067 try = -1; 2068 break; 2069 } 2070 if (try == -1) 2071 return -1; 2072 s = *scan++; 2073 } 2074 2075 if (try > largest) 2076 largest = try; 2077 2078 return largest+offset; 2079 } 2080 2081 /* 2082 - computejumps - compute char jumps for BM scan 2083 == static void computejumps(struct parse *p, struct re_guts *g); 2084 * 2085 * This algorithm assumes g->must exists and is has size greater than 2086 * zero. It's based on the algorithm found on Computer Algorithms by 2087 * Sara Baase. 2088 * 2089 * A char jump is the number of characters one needs to jump based on 2090 * the value of the character from the text that was mismatched. 2091 */ 2092 static void 2093 computejumps(struct parse *p, struct re_guts *g) 2094 { 2095 int ch; 2096 int mindex; 2097 2098 /* Avoid making errors worse */ 2099 if (p->error != 0) 2100 return; 2101 2102 g->charjump = (int *)malloc((NC_MAX + 1) * sizeof(int)); 2103 if (g->charjump == NULL) /* Not a fatal error */ 2104 return; 2105 /* Adjust for signed chars, if necessary */ 2106 g->charjump = &g->charjump[-(CHAR_MIN)]; 2107 2108 /* If the character does not exist in the pattern, the jump 2109 * is equal to the number of characters in the pattern. 2110 */ 2111 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 2112 g->charjump[ch] = g->mlen; 2113 2114 /* If the character does exist, compute the jump that would 2115 * take us to the last character in the pattern equal to it 2116 * (notice that we match right to left, so that last character 2117 * is the first one that would be matched). 2118 */ 2119 for (mindex = 0; mindex < g->mlen; mindex++) 2120 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 2121 } 2122 2123 /* 2124 - computematchjumps - compute match jumps for BM scan 2125 == static void computematchjumps(struct parse *p, struct re_guts *g); 2126 * 2127 * This algorithm assumes g->must exists and is has size greater than 2128 * zero. It's based on the algorithm found on Computer Algorithms by 2129 * Sara Baase. 2130 * 2131 * A match jump is the number of characters one needs to advance based 2132 * on the already-matched suffix. 2133 * Notice that all values here are minus (g->mlen-1), because of the way 2134 * the search algorithm works. 2135 */ 2136 static void 2137 computematchjumps(struct parse *p, struct re_guts *g) 2138 { 2139 int mindex; /* General "must" iterator */ 2140 int suffix; /* Keeps track of matching suffix */ 2141 int ssuffix; /* Keeps track of suffixes' suffix */ 2142 int* pmatches; /* pmatches[k] points to the next i 2143 * such that i+1...mlen is a substring 2144 * of k+1...k+mlen-i-1 2145 */ 2146 2147 /* Avoid making errors worse */ 2148 if (p->error != 0) 2149 return; 2150 2151 pmatches = (int*) malloc(g->mlen * sizeof(int)); 2152 if (pmatches == NULL) { 2153 g->matchjump = NULL; 2154 return; 2155 } 2156 2157 g->matchjump = (int*) malloc(g->mlen * sizeof(int)); 2158 if (g->matchjump == NULL) { /* Not a fatal error */ 2159 free(pmatches); 2160 return; 2161 } 2162 2163 /* Set maximum possible jump for each character in the pattern */ 2164 for (mindex = 0; mindex < g->mlen; mindex++) 2165 g->matchjump[mindex] = 2*g->mlen - mindex - 1; 2166 2167 /* Compute pmatches[] */ 2168 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 2169 mindex--, suffix--) { 2170 pmatches[mindex] = suffix; 2171 2172 /* If a mismatch is found, interrupting the substring, 2173 * compute the matchjump for that position. If no 2174 * mismatch is found, then a text substring mismatched 2175 * against the suffix will also mismatch against the 2176 * substring. 2177 */ 2178 while (suffix < g->mlen 2179 && g->must[mindex] != g->must[suffix]) { 2180 g->matchjump[suffix] = MIN(g->matchjump[suffix], 2181 g->mlen - mindex - 1); 2182 suffix = pmatches[suffix]; 2183 } 2184 } 2185 2186 /* Compute the matchjump up to the last substring found to jump 2187 * to the beginning of the largest must pattern prefix matching 2188 * it's own suffix. 2189 */ 2190 for (mindex = 0; mindex <= suffix; mindex++) 2191 g->matchjump[mindex] = MIN(g->matchjump[mindex], 2192 g->mlen + suffix - mindex); 2193 2194 ssuffix = pmatches[suffix]; 2195 while (suffix < g->mlen) { 2196 while (suffix <= ssuffix && suffix < g->mlen) { 2197 g->matchjump[suffix] = MIN(g->matchjump[suffix], 2198 g->mlen + ssuffix - suffix); 2199 suffix++; 2200 } 2201 if (suffix < g->mlen) 2202 ssuffix = pmatches[ssuffix]; 2203 } 2204 2205 free(pmatches); 2206 } 2207 2208 /* 2209 - pluscount - count + nesting 2210 == static sopno pluscount(struct parse *p, struct re_guts *g); 2211 */ 2212 static sopno /* nesting depth */ 2213 pluscount(struct parse *p, struct re_guts *g) 2214 { 2215 sop *scan; 2216 sop s; 2217 sopno plusnest = 0; 2218 sopno maxnest = 0; 2219 2220 if (p->error != 0) 2221 return(0); /* there may not be an OEND */ 2222 2223 scan = g->strip + 1; 2224 do { 2225 s = *scan++; 2226 switch (OP(s)) { 2227 case OPLUS_: 2228 plusnest++; 2229 break; 2230 case O_PLUS: 2231 if (plusnest > maxnest) 2232 maxnest = plusnest; 2233 plusnest--; 2234 break; 2235 } 2236 } while (OP(s) != OEND); 2237 if (plusnest != 0) 2238 g->iflags |= BAD; 2239 return(maxnest); 2240 } 2241