1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * Copyright (c) 2011 The FreeBSD Foundation 7 * All rights reserved. 8 * Portions of this software were developed by David Chisnall 9 * under sponsorship from the FreeBSD Foundation. 10 * 11 * This code is derived from software contributed to Berkeley by 12 * Henry Spencer. 13 * 14 * Copyright (c) 2011 The FreeBSD Foundation 15 * All rights reserved. 16 * Portions of this software were developed by David Chisnall 17 * under sponsorship from the FreeBSD Foundation. 18 * 19 * Redistribution and use in source and binary forms, with or without 20 * modification, are permitted provided that the following conditions 21 * are met: 22 * 1. Redistributions of source code must retain the above copyright 23 * notice, this list of conditions and the following disclaimer. 24 * 2. Redistributions in binary form must reproduce the above copyright 25 * notice, this list of conditions and the following disclaimer in the 26 * documentation and/or other materials provided with the distribution. 27 * 4. Neither the name of the University nor the names of its contributors 28 * may be used to endorse or promote products derived from this software 29 * without specific prior written permission. 30 * 31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 41 * SUCH DAMAGE. 42 * 43 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 44 */ 45 46 #if defined(LIBC_SCCS) && !defined(lint) 47 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 48 #endif /* LIBC_SCCS and not lint */ 49 #include <sys/cdefs.h> 50 __FBSDID("$FreeBSD$"); 51 52 #include <sys/types.h> 53 #include <stdio.h> 54 #include <string.h> 55 #include <ctype.h> 56 #include <limits.h> 57 #include <stdlib.h> 58 #include <regex.h> 59 #include <runetype.h> 60 #include <wchar.h> 61 #include <wctype.h> 62 63 #include "collate.h" 64 65 #include "utils.h" 66 #include "regex2.h" 67 68 #include "cname.h" 69 70 /* 71 * parse structure, passed up and down to avoid global variables and 72 * other clumsinesses 73 */ 74 struct parse { 75 char *next; /* next character in RE */ 76 char *end; /* end of string (-> NUL normally) */ 77 int error; /* has an error been seen? */ 78 sop *strip; /* malloced strip */ 79 sopno ssize; /* malloced strip size (allocated) */ 80 sopno slen; /* malloced strip length (used) */ 81 int ncsalloc; /* number of csets allocated */ 82 struct re_guts *g; 83 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 84 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 85 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 86 }; 87 88 /* ========= begin header generated by ./mkh ========= */ 89 #ifdef __cplusplus 90 extern "C" { 91 #endif 92 93 /* === regcomp.c === */ 94 static void p_ere(struct parse *p, int stop); 95 static void p_ere_exp(struct parse *p); 96 static void p_str(struct parse *p); 97 static void p_bre(struct parse *p, int end1, int end2); 98 static int p_simp_re(struct parse *p, int starordinary); 99 static int p_count(struct parse *p); 100 static void p_bracket(struct parse *p); 101 static void p_b_term(struct parse *p, cset *cs); 102 static void p_b_cclass(struct parse *p, cset *cs); 103 static void p_b_eclass(struct parse *p, cset *cs); 104 static wint_t p_b_symbol(struct parse *p); 105 static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 106 static wint_t othercase(wint_t ch); 107 static void bothcases(struct parse *p, wint_t ch); 108 static void ordinary(struct parse *p, wint_t ch); 109 static void nonnewline(struct parse *p); 110 static void repeat(struct parse *p, sopno start, int from, int to); 111 static int seterr(struct parse *p, int e); 112 static cset *allocset(struct parse *p); 113 static void freeset(struct parse *p, cset *cs); 114 static void CHadd(struct parse *p, cset *cs, wint_t ch); 115 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 116 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 117 static wint_t singleton(cset *cs); 118 static sopno dupl(struct parse *p, sopno start, sopno finish); 119 static void doemit(struct parse *p, sop op, size_t opnd); 120 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 121 static void dofwd(struct parse *p, sopno pos, sop value); 122 static int enlarge(struct parse *p, sopno size); 123 static void stripsnug(struct parse *p, struct re_guts *g); 124 static void findmust(struct parse *p, struct re_guts *g); 125 static int altoffset(sop *scan, int offset); 126 static void computejumps(struct parse *p, struct re_guts *g); 127 static void computematchjumps(struct parse *p, struct re_guts *g); 128 static sopno pluscount(struct parse *p, struct re_guts *g); 129 static wint_t wgetnext(struct parse *p); 130 131 #ifdef __cplusplus 132 } 133 #endif 134 /* ========= end header generated by ./mkh ========= */ 135 136 static char nuls[10]; /* place to point scanner in event of error */ 137 138 /* 139 * macros for use with parse structure 140 * BEWARE: these know that the parse structure is named `p' !!! 141 */ 142 #define PEEK() (*p->next) 143 #define PEEK2() (*(p->next+1)) 144 #define MORE() (p->next < p->end) 145 #define MORE2() (p->next+1 < p->end) 146 #define SEE(c) (MORE() && PEEK() == (c)) 147 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 148 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 149 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 150 #define NEXT() (p->next++) 151 #define NEXT2() (p->next += 2) 152 #define NEXTn(n) (p->next += (n)) 153 #define GETNEXT() (*p->next++) 154 #define WGETNEXT() wgetnext(p) 155 #define SETERROR(e) seterr(p, (e)) 156 #define REQUIRE(co, e) ((co) || SETERROR(e)) 157 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 158 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 159 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 160 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 161 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 162 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 163 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 164 #define HERE() (p->slen) 165 #define THERE() (p->slen - 1) 166 #define THERETHERE() (p->slen - 2) 167 #define DROP(n) (p->slen -= (n)) 168 169 #ifndef NDEBUG 170 static int never = 0; /* for use in asserts; shuts lint up */ 171 #else 172 #define never 0 /* some <assert.h>s have bugs too */ 173 #endif 174 175 /* Macro used by computejump()/computematchjump() */ 176 #define MIN(a,b) ((a)<(b)?(a):(b)) 177 178 /* 179 - regcomp - interface for parser and compilation 180 = extern int regcomp(regex_t *, const char *, int); 181 = #define REG_BASIC 0000 182 = #define REG_EXTENDED 0001 183 = #define REG_ICASE 0002 184 = #define REG_NOSUB 0004 185 = #define REG_NEWLINE 0010 186 = #define REG_NOSPEC 0020 187 = #define REG_PEND 0040 188 = #define REG_DUMP 0200 189 */ 190 int /* 0 success, otherwise REG_something */ 191 regcomp(regex_t * __restrict preg, 192 const char * __restrict pattern, 193 int cflags) 194 { 195 struct parse pa; 196 struct re_guts *g; 197 struct parse *p = &pa; 198 int i; 199 size_t len; 200 #ifdef REDEBUG 201 # define GOODFLAGS(f) (f) 202 #else 203 # define GOODFLAGS(f) ((f)&~REG_DUMP) 204 #endif 205 206 cflags = GOODFLAGS(cflags); 207 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 208 return(REG_INVARG); 209 210 if (cflags®_PEND) { 211 if (preg->re_endp < pattern) 212 return(REG_INVARG); 213 len = preg->re_endp - pattern; 214 } else 215 len = strlen((char *)pattern); 216 217 /* do the mallocs early so failure handling is easy */ 218 g = (struct re_guts *)malloc(sizeof(struct re_guts)); 219 if (g == NULL) 220 return(REG_ESPACE); 221 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 222 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 223 p->slen = 0; 224 if (p->strip == NULL) { 225 free((char *)g); 226 return(REG_ESPACE); 227 } 228 229 /* set things up */ 230 p->g = g; 231 p->next = (char *)pattern; /* convenience; we do not modify it */ 232 p->end = p->next + len; 233 p->error = 0; 234 p->ncsalloc = 0; 235 for (i = 0; i < NPAREN; i++) { 236 p->pbegin[i] = 0; 237 p->pend[i] = 0; 238 } 239 g->sets = NULL; 240 g->ncsets = 0; 241 g->cflags = cflags; 242 g->iflags = 0; 243 g->nbol = 0; 244 g->neol = 0; 245 g->must = NULL; 246 g->moffset = -1; 247 g->charjump = NULL; 248 g->matchjump = NULL; 249 g->mlen = 0; 250 g->nsub = 0; 251 g->backrefs = 0; 252 253 /* do it */ 254 EMIT(OEND, 0); 255 g->firststate = THERE(); 256 if (cflags®_EXTENDED) 257 p_ere(p, OUT); 258 else if (cflags®_NOSPEC) 259 p_str(p); 260 else 261 p_bre(p, OUT, OUT); 262 EMIT(OEND, 0); 263 g->laststate = THERE(); 264 265 /* tidy up loose ends and fill things in */ 266 stripsnug(p, g); 267 findmust(p, g); 268 /* only use Boyer-Moore algorithm if the pattern is bigger 269 * than three characters 270 */ 271 if(g->mlen > 3) { 272 computejumps(p, g); 273 computematchjumps(p, g); 274 if(g->matchjump == NULL && g->charjump != NULL) { 275 free(g->charjump); 276 g->charjump = NULL; 277 } 278 } 279 g->nplus = pluscount(p, g); 280 g->magic = MAGIC2; 281 preg->re_nsub = g->nsub; 282 preg->re_g = g; 283 preg->re_magic = MAGIC1; 284 #ifndef REDEBUG 285 /* not debugging, so can't rely on the assert() in regexec() */ 286 if (g->iflags&BAD) 287 SETERROR(REG_ASSERT); 288 #endif 289 290 /* win or lose, we're done */ 291 if (p->error != 0) /* lose */ 292 regfree(preg); 293 return(p->error); 294 } 295 296 /* 297 - p_ere - ERE parser top level, concatenation and alternation 298 == static void p_ere(struct parse *p, int_t stop); 299 */ 300 static void 301 p_ere(struct parse *p, 302 int stop) /* character this ERE should end at */ 303 { 304 char c; 305 sopno prevback; 306 sopno prevfwd; 307 sopno conc; 308 int first = 1; /* is this the first alternative? */ 309 310 for (;;) { 311 /* do a bunch of concatenated expressions */ 312 conc = HERE(); 313 while (MORE() && (c = PEEK()) != '|' && c != stop) 314 p_ere_exp(p); 315 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 316 317 if (!EAT('|')) 318 break; /* NOTE BREAK OUT */ 319 320 if (first) { 321 INSERT(OCH_, conc); /* offset is wrong */ 322 prevfwd = conc; 323 prevback = conc; 324 first = 0; 325 } 326 ASTERN(OOR1, prevback); 327 prevback = THERE(); 328 AHEAD(prevfwd); /* fix previous offset */ 329 prevfwd = HERE(); 330 EMIT(OOR2, 0); /* offset is very wrong */ 331 } 332 333 if (!first) { /* tail-end fixups */ 334 AHEAD(prevfwd); 335 ASTERN(O_CH, prevback); 336 } 337 338 assert(!MORE() || SEE(stop)); 339 } 340 341 /* 342 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 343 == static void p_ere_exp(struct parse *p); 344 */ 345 static void 346 p_ere_exp(struct parse *p) 347 { 348 char c; 349 wint_t wc; 350 sopno pos; 351 int count; 352 int count2; 353 sopno subno; 354 int wascaret = 0; 355 356 assert(MORE()); /* caller should have ensured this */ 357 c = GETNEXT(); 358 359 pos = HERE(); 360 switch (c) { 361 case '(': 362 (void)REQUIRE(MORE(), REG_EPAREN); 363 p->g->nsub++; 364 subno = p->g->nsub; 365 if (subno < NPAREN) 366 p->pbegin[subno] = HERE(); 367 EMIT(OLPAREN, subno); 368 if (!SEE(')')) 369 p_ere(p, ')'); 370 if (subno < NPAREN) { 371 p->pend[subno] = HERE(); 372 assert(p->pend[subno] != 0); 373 } 374 EMIT(ORPAREN, subno); 375 (void)MUSTEAT(')', REG_EPAREN); 376 break; 377 #ifndef POSIX_MISTAKE 378 case ')': /* happens only if no current unmatched ( */ 379 /* 380 * You may ask, why the ifndef? Because I didn't notice 381 * this until slightly too late for 1003.2, and none of the 382 * other 1003.2 regular-expression reviewers noticed it at 383 * all. So an unmatched ) is legal POSIX, at least until 384 * we can get it fixed. 385 */ 386 SETERROR(REG_EPAREN); 387 break; 388 #endif 389 case '^': 390 EMIT(OBOL, 0); 391 p->g->iflags |= USEBOL; 392 p->g->nbol++; 393 wascaret = 1; 394 break; 395 case '$': 396 EMIT(OEOL, 0); 397 p->g->iflags |= USEEOL; 398 p->g->neol++; 399 break; 400 case '|': 401 SETERROR(REG_EMPTY); 402 break; 403 case '*': 404 case '+': 405 case '?': 406 SETERROR(REG_BADRPT); 407 break; 408 case '.': 409 if (p->g->cflags®_NEWLINE) 410 nonnewline(p); 411 else 412 EMIT(OANY, 0); 413 break; 414 case '[': 415 p_bracket(p); 416 break; 417 case '\\': 418 (void)REQUIRE(MORE(), REG_EESCAPE); 419 wc = WGETNEXT(); 420 ordinary(p, wc); 421 break; 422 case '{': /* okay as ordinary except if digit follows */ 423 (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 424 /* FALLTHROUGH */ 425 default: 426 p->next--; 427 wc = WGETNEXT(); 428 ordinary(p, wc); 429 break; 430 } 431 432 if (!MORE()) 433 return; 434 c = PEEK(); 435 /* we call { a repetition if followed by a digit */ 436 if (!( c == '*' || c == '+' || c == '?' || 437 (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) 438 return; /* no repetition, we're done */ 439 NEXT(); 440 441 (void)REQUIRE(!wascaret, REG_BADRPT); 442 switch (c) { 443 case '*': /* implemented as +? */ 444 /* this case does not require the (y|) trick, noKLUDGE */ 445 INSERT(OPLUS_, pos); 446 ASTERN(O_PLUS, pos); 447 INSERT(OQUEST_, pos); 448 ASTERN(O_QUEST, pos); 449 break; 450 case '+': 451 INSERT(OPLUS_, pos); 452 ASTERN(O_PLUS, pos); 453 break; 454 case '?': 455 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 456 INSERT(OCH_, pos); /* offset slightly wrong */ 457 ASTERN(OOR1, pos); /* this one's right */ 458 AHEAD(pos); /* fix the OCH_ */ 459 EMIT(OOR2, 0); /* offset very wrong... */ 460 AHEAD(THERE()); /* ...so fix it */ 461 ASTERN(O_CH, THERETHERE()); 462 break; 463 case '{': 464 count = p_count(p); 465 if (EAT(',')) { 466 if (isdigit((uch)PEEK())) { 467 count2 = p_count(p); 468 (void)REQUIRE(count <= count2, REG_BADBR); 469 } else /* single number with comma */ 470 count2 = INFINITY; 471 } else /* just a single number */ 472 count2 = count; 473 repeat(p, pos, count, count2); 474 if (!EAT('}')) { /* error heuristics */ 475 while (MORE() && PEEK() != '}') 476 NEXT(); 477 (void)REQUIRE(MORE(), REG_EBRACE); 478 SETERROR(REG_BADBR); 479 } 480 break; 481 } 482 483 if (!MORE()) 484 return; 485 c = PEEK(); 486 if (!( c == '*' || c == '+' || c == '?' || 487 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 488 return; 489 SETERROR(REG_BADRPT); 490 } 491 492 /* 493 - p_str - string (no metacharacters) "parser" 494 == static void p_str(struct parse *p); 495 */ 496 static void 497 p_str(struct parse *p) 498 { 499 (void)REQUIRE(MORE(), REG_EMPTY); 500 while (MORE()) 501 ordinary(p, WGETNEXT()); 502 } 503 504 /* 505 - p_bre - BRE parser top level, anchoring and concatenation 506 == static void p_bre(struct parse *p, int end1, \ 507 == int end2); 508 * Giving end1 as OUT essentially eliminates the end1/end2 check. 509 * 510 * This implementation is a bit of a kludge, in that a trailing $ is first 511 * taken as an ordinary character and then revised to be an anchor. 512 * The amount of lookahead needed to avoid this kludge is excessive. 513 */ 514 static void 515 p_bre(struct parse *p, 516 int end1, /* first terminating character */ 517 int end2) /* second terminating character */ 518 { 519 sopno start = HERE(); 520 int first = 1; /* first subexpression? */ 521 int wasdollar = 0; 522 523 if (EAT('^')) { 524 EMIT(OBOL, 0); 525 p->g->iflags |= USEBOL; 526 p->g->nbol++; 527 } 528 while (MORE() && !SEETWO(end1, end2)) { 529 wasdollar = p_simp_re(p, first); 530 first = 0; 531 } 532 if (wasdollar) { /* oops, that was a trailing anchor */ 533 DROP(1); 534 EMIT(OEOL, 0); 535 p->g->iflags |= USEEOL; 536 p->g->neol++; 537 } 538 539 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 540 } 541 542 /* 543 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 544 == static int p_simp_re(struct parse *p, int starordinary); 545 */ 546 static int /* was the simple RE an unbackslashed $? */ 547 p_simp_re(struct parse *p, 548 int starordinary) /* is a leading * an ordinary character? */ 549 { 550 int c; 551 int count; 552 int count2; 553 sopno pos; 554 int i; 555 wint_t wc; 556 sopno subno; 557 # define BACKSL (1<<CHAR_BIT) 558 559 pos = HERE(); /* repetion op, if any, covers from here */ 560 561 assert(MORE()); /* caller should have ensured this */ 562 c = GETNEXT(); 563 if (c == '\\') { 564 (void)REQUIRE(MORE(), REG_EESCAPE); 565 c = BACKSL | GETNEXT(); 566 } 567 switch (c) { 568 case '.': 569 if (p->g->cflags®_NEWLINE) 570 nonnewline(p); 571 else 572 EMIT(OANY, 0); 573 break; 574 case '[': 575 p_bracket(p); 576 break; 577 case BACKSL|'{': 578 SETERROR(REG_BADRPT); 579 break; 580 case BACKSL|'(': 581 p->g->nsub++; 582 subno = p->g->nsub; 583 if (subno < NPAREN) 584 p->pbegin[subno] = HERE(); 585 EMIT(OLPAREN, subno); 586 /* the MORE here is an error heuristic */ 587 if (MORE() && !SEETWO('\\', ')')) 588 p_bre(p, '\\', ')'); 589 if (subno < NPAREN) { 590 p->pend[subno] = HERE(); 591 assert(p->pend[subno] != 0); 592 } 593 EMIT(ORPAREN, subno); 594 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 595 break; 596 case BACKSL|')': /* should not get here -- must be user */ 597 case BACKSL|'}': 598 SETERROR(REG_EPAREN); 599 break; 600 case BACKSL|'1': 601 case BACKSL|'2': 602 case BACKSL|'3': 603 case BACKSL|'4': 604 case BACKSL|'5': 605 case BACKSL|'6': 606 case BACKSL|'7': 607 case BACKSL|'8': 608 case BACKSL|'9': 609 i = (c&~BACKSL) - '0'; 610 assert(i < NPAREN); 611 if (p->pend[i] != 0) { 612 assert(i <= p->g->nsub); 613 EMIT(OBACK_, i); 614 assert(p->pbegin[i] != 0); 615 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 616 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 617 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 618 EMIT(O_BACK, i); 619 } else 620 SETERROR(REG_ESUBREG); 621 p->g->backrefs = 1; 622 break; 623 case '*': 624 (void)REQUIRE(starordinary, REG_BADRPT); 625 /* FALLTHROUGH */ 626 default: 627 p->next--; 628 wc = WGETNEXT(); 629 ordinary(p, wc); 630 break; 631 } 632 633 if (EAT('*')) { /* implemented as +? */ 634 /* this case does not require the (y|) trick, noKLUDGE */ 635 INSERT(OPLUS_, pos); 636 ASTERN(O_PLUS, pos); 637 INSERT(OQUEST_, pos); 638 ASTERN(O_QUEST, pos); 639 } else if (EATTWO('\\', '{')) { 640 count = p_count(p); 641 if (EAT(',')) { 642 if (MORE() && isdigit((uch)PEEK())) { 643 count2 = p_count(p); 644 (void)REQUIRE(count <= count2, REG_BADBR); 645 } else /* single number with comma */ 646 count2 = INFINITY; 647 } else /* just a single number */ 648 count2 = count; 649 repeat(p, pos, count, count2); 650 if (!EATTWO('\\', '}')) { /* error heuristics */ 651 while (MORE() && !SEETWO('\\', '}')) 652 NEXT(); 653 (void)REQUIRE(MORE(), REG_EBRACE); 654 SETERROR(REG_BADBR); 655 } 656 } else if (c == '$') /* $ (but not \$) ends it */ 657 return(1); 658 659 return(0); 660 } 661 662 /* 663 - p_count - parse a repetition count 664 == static int p_count(struct parse *p); 665 */ 666 static int /* the value */ 667 p_count(struct parse *p) 668 { 669 int count = 0; 670 int ndigits = 0; 671 672 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 673 count = count*10 + (GETNEXT() - '0'); 674 ndigits++; 675 } 676 677 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 678 return(count); 679 } 680 681 /* 682 - p_bracket - parse a bracketed character list 683 == static void p_bracket(struct parse *p); 684 */ 685 static void 686 p_bracket(struct parse *p) 687 { 688 cset *cs; 689 wint_t ch; 690 691 /* Dept of Truly Sickening Special-Case Kludges */ 692 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 693 EMIT(OBOW, 0); 694 NEXTn(6); 695 return; 696 } 697 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 698 EMIT(OEOW, 0); 699 NEXTn(6); 700 return; 701 } 702 703 if ((cs = allocset(p)) == NULL) 704 return; 705 706 if (p->g->cflags®_ICASE) 707 cs->icase = 1; 708 if (EAT('^')) 709 cs->invert = 1; 710 if (EAT(']')) 711 CHadd(p, cs, ']'); 712 else if (EAT('-')) 713 CHadd(p, cs, '-'); 714 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 715 p_b_term(p, cs); 716 if (EAT('-')) 717 CHadd(p, cs, '-'); 718 (void)MUSTEAT(']', REG_EBRACK); 719 720 if (p->error != 0) /* don't mess things up further */ 721 return; 722 723 if (cs->invert && p->g->cflags®_NEWLINE) 724 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 725 726 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 727 ordinary(p, ch); 728 freeset(p, cs); 729 } else 730 EMIT(OANYOF, (int)(cs - p->g->sets)); 731 } 732 733 /* 734 - p_b_term - parse one term of a bracketed character list 735 == static void p_b_term(struct parse *p, cset *cs); 736 */ 737 static void 738 p_b_term(struct parse *p, cset *cs) 739 { 740 char c; 741 wint_t start, finish; 742 wint_t i; 743 struct xlocale_collate *table = 744 (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE]; 745 746 /* classify what we've got */ 747 switch ((MORE()) ? PEEK() : '\0') { 748 case '[': 749 c = (MORE2()) ? PEEK2() : '\0'; 750 break; 751 case '-': 752 SETERROR(REG_ERANGE); 753 return; /* NOTE RETURN */ 754 break; 755 default: 756 c = '\0'; 757 break; 758 } 759 760 switch (c) { 761 case ':': /* character class */ 762 NEXT2(); 763 (void)REQUIRE(MORE(), REG_EBRACK); 764 c = PEEK(); 765 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 766 p_b_cclass(p, cs); 767 (void)REQUIRE(MORE(), REG_EBRACK); 768 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 769 break; 770 case '=': /* equivalence class */ 771 NEXT2(); 772 (void)REQUIRE(MORE(), REG_EBRACK); 773 c = PEEK(); 774 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 775 p_b_eclass(p, cs); 776 (void)REQUIRE(MORE(), REG_EBRACK); 777 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 778 break; 779 default: /* symbol, ordinary character, or range */ 780 start = p_b_symbol(p); 781 if (SEE('-') && MORE2() && PEEK2() != ']') { 782 /* range */ 783 NEXT(); 784 if (EAT('-')) 785 finish = '-'; 786 else 787 finish = p_b_symbol(p); 788 } else 789 finish = start; 790 if (start == finish) 791 CHadd(p, cs, start); 792 else { 793 if (table->__collate_load_error) { 794 (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); 795 CHaddrange(p, cs, start, finish); 796 } else { 797 (void)REQUIRE(__collate_range_cmp(table, start, finish) <= 0, REG_ERANGE); 798 for (i = 0; i <= UCHAR_MAX; i++) { 799 if ( __collate_range_cmp(table, start, i) <= 0 800 && __collate_range_cmp(table, i, finish) <= 0 801 ) 802 CHadd(p, cs, i); 803 } 804 } 805 } 806 break; 807 } 808 } 809 810 /* 811 - p_b_cclass - parse a character-class name and deal with it 812 == static void p_b_cclass(struct parse *p, cset *cs); 813 */ 814 static void 815 p_b_cclass(struct parse *p, cset *cs) 816 { 817 char *sp = p->next; 818 size_t len; 819 wctype_t wct; 820 char clname[16]; 821 822 while (MORE() && isalpha((uch)PEEK())) 823 NEXT(); 824 len = p->next - sp; 825 if (len >= sizeof(clname) - 1) { 826 SETERROR(REG_ECTYPE); 827 return; 828 } 829 memcpy(clname, sp, len); 830 clname[len] = '\0'; 831 if ((wct = wctype(clname)) == 0) { 832 SETERROR(REG_ECTYPE); 833 return; 834 } 835 CHaddtype(p, cs, wct); 836 } 837 838 /* 839 - p_b_eclass - parse an equivalence-class name and deal with it 840 == static void p_b_eclass(struct parse *p, cset *cs); 841 * 842 * This implementation is incomplete. xxx 843 */ 844 static void 845 p_b_eclass(struct parse *p, cset *cs) 846 { 847 wint_t c; 848 849 c = p_b_coll_elem(p, '='); 850 CHadd(p, cs, c); 851 } 852 853 /* 854 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 855 == static wint_t p_b_symbol(struct parse *p); 856 */ 857 static wint_t /* value of symbol */ 858 p_b_symbol(struct parse *p) 859 { 860 wint_t value; 861 862 (void)REQUIRE(MORE(), REG_EBRACK); 863 if (!EATTWO('[', '.')) 864 return(WGETNEXT()); 865 866 /* collating symbol */ 867 value = p_b_coll_elem(p, '.'); 868 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 869 return(value); 870 } 871 872 /* 873 - p_b_coll_elem - parse a collating-element name and look it up 874 == static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 875 */ 876 static wint_t /* value of collating element */ 877 p_b_coll_elem(struct parse *p, 878 wint_t endc) /* name ended by endc,']' */ 879 { 880 char *sp = p->next; 881 struct cname *cp; 882 int len; 883 mbstate_t mbs; 884 wchar_t wc; 885 size_t clen; 886 887 while (MORE() && !SEETWO(endc, ']')) 888 NEXT(); 889 if (!MORE()) { 890 SETERROR(REG_EBRACK); 891 return(0); 892 } 893 len = p->next - sp; 894 for (cp = cnames; cp->name != NULL; cp++) 895 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 896 return(cp->code); /* known name */ 897 memset(&mbs, 0, sizeof(mbs)); 898 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 899 return (wc); /* single character */ 900 else if (clen == (size_t)-1 || clen == (size_t)-2) 901 SETERROR(REG_ILLSEQ); 902 else 903 SETERROR(REG_ECOLLATE); /* neither */ 904 return(0); 905 } 906 907 /* 908 - othercase - return the case counterpart of an alphabetic 909 == static wint_t othercase(wint_t ch); 910 */ 911 static wint_t /* if no counterpart, return ch */ 912 othercase(wint_t ch) 913 { 914 assert(iswalpha(ch)); 915 if (iswupper(ch)) 916 return(towlower(ch)); 917 else if (iswlower(ch)) 918 return(towupper(ch)); 919 else /* peculiar, but could happen */ 920 return(ch); 921 } 922 923 /* 924 - bothcases - emit a dualcase version of a two-case character 925 == static void bothcases(struct parse *p, wint_t ch); 926 * 927 * Boy, is this implementation ever a kludge... 928 */ 929 static void 930 bothcases(struct parse *p, wint_t ch) 931 { 932 char *oldnext = p->next; 933 char *oldend = p->end; 934 char bracket[3 + MB_LEN_MAX]; 935 size_t n; 936 mbstate_t mbs; 937 938 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 939 p->next = bracket; 940 memset(&mbs, 0, sizeof(mbs)); 941 n = wcrtomb(bracket, ch, &mbs); 942 assert(n != (size_t)-1); 943 bracket[n] = ']'; 944 bracket[n + 1] = '\0'; 945 p->end = bracket+n+1; 946 p_bracket(p); 947 assert(p->next == p->end); 948 p->next = oldnext; 949 p->end = oldend; 950 } 951 952 /* 953 - ordinary - emit an ordinary character 954 == static void ordinary(struct parse *p, wint_t ch); 955 */ 956 static void 957 ordinary(struct parse *p, wint_t ch) 958 { 959 cset *cs; 960 961 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 962 bothcases(p, ch); 963 else if ((ch & OPDMASK) == ch) 964 EMIT(OCHAR, ch); 965 else { 966 /* 967 * Kludge: character is too big to fit into an OCHAR operand. 968 * Emit a singleton set. 969 */ 970 if ((cs = allocset(p)) == NULL) 971 return; 972 CHadd(p, cs, ch); 973 EMIT(OANYOF, (int)(cs - p->g->sets)); 974 } 975 } 976 977 /* 978 - nonnewline - emit REG_NEWLINE version of OANY 979 == static void nonnewline(struct parse *p); 980 * 981 * Boy, is this implementation ever a kludge... 982 */ 983 static void 984 nonnewline(struct parse *p) 985 { 986 char *oldnext = p->next; 987 char *oldend = p->end; 988 char bracket[4]; 989 990 p->next = bracket; 991 p->end = bracket+3; 992 bracket[0] = '^'; 993 bracket[1] = '\n'; 994 bracket[2] = ']'; 995 bracket[3] = '\0'; 996 p_bracket(p); 997 assert(p->next == bracket+3); 998 p->next = oldnext; 999 p->end = oldend; 1000 } 1001 1002 /* 1003 - repeat - generate code for a bounded repetition, recursively if needed 1004 == static void repeat(struct parse *p, sopno start, int from, int to); 1005 */ 1006 static void 1007 repeat(struct parse *p, 1008 sopno start, /* operand from here to end of strip */ 1009 int from, /* repeated from this number */ 1010 int to) /* to this number of times (maybe INFINITY) */ 1011 { 1012 sopno finish = HERE(); 1013 # define N 2 1014 # define INF 3 1015 # define REP(f, t) ((f)*8 + (t)) 1016 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1017 sopno copy; 1018 1019 if (p->error != 0) /* head off possible runaway recursion */ 1020 return; 1021 1022 assert(from <= to); 1023 1024 switch (REP(MAP(from), MAP(to))) { 1025 case REP(0, 0): /* must be user doing this */ 1026 DROP(finish-start); /* drop the operand */ 1027 break; 1028 case REP(0, 1): /* as x{1,1}? */ 1029 case REP(0, N): /* as x{1,n}? */ 1030 case REP(0, INF): /* as x{1,}? */ 1031 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1032 INSERT(OCH_, start); /* offset is wrong... */ 1033 repeat(p, start+1, 1, to); 1034 ASTERN(OOR1, start); 1035 AHEAD(start); /* ... fix it */ 1036 EMIT(OOR2, 0); 1037 AHEAD(THERE()); 1038 ASTERN(O_CH, THERETHERE()); 1039 break; 1040 case REP(1, 1): /* trivial case */ 1041 /* done */ 1042 break; 1043 case REP(1, N): /* as x?x{1,n-1} */ 1044 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1045 INSERT(OCH_, start); 1046 ASTERN(OOR1, start); 1047 AHEAD(start); 1048 EMIT(OOR2, 0); /* offset very wrong... */ 1049 AHEAD(THERE()); /* ...so fix it */ 1050 ASTERN(O_CH, THERETHERE()); 1051 copy = dupl(p, start+1, finish+1); 1052 assert(copy == finish+4); 1053 repeat(p, copy, 1, to-1); 1054 break; 1055 case REP(1, INF): /* as x+ */ 1056 INSERT(OPLUS_, start); 1057 ASTERN(O_PLUS, start); 1058 break; 1059 case REP(N, N): /* as xx{m-1,n-1} */ 1060 copy = dupl(p, start, finish); 1061 repeat(p, copy, from-1, to-1); 1062 break; 1063 case REP(N, INF): /* as xx{n-1,INF} */ 1064 copy = dupl(p, start, finish); 1065 repeat(p, copy, from-1, to); 1066 break; 1067 default: /* "can't happen" */ 1068 SETERROR(REG_ASSERT); /* just in case */ 1069 break; 1070 } 1071 } 1072 1073 /* 1074 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1075 - character from the parse struct, signals a REG_ILLSEQ error if the 1076 - character can't be converted. Returns the number of bytes consumed. 1077 */ 1078 static wint_t 1079 wgetnext(struct parse *p) 1080 { 1081 mbstate_t mbs; 1082 wchar_t wc; 1083 size_t n; 1084 1085 memset(&mbs, 0, sizeof(mbs)); 1086 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); 1087 if (n == (size_t)-1 || n == (size_t)-2) { 1088 SETERROR(REG_ILLSEQ); 1089 return (0); 1090 } 1091 if (n == 0) 1092 n = 1; 1093 p->next += n; 1094 return (wc); 1095 } 1096 1097 /* 1098 - seterr - set an error condition 1099 == static int seterr(struct parse *p, int e); 1100 */ 1101 static int /* useless but makes type checking happy */ 1102 seterr(struct parse *p, int e) 1103 { 1104 if (p->error == 0) /* keep earliest error condition */ 1105 p->error = e; 1106 p->next = nuls; /* try to bring things to a halt */ 1107 p->end = nuls; 1108 return(0); /* make the return value well-defined */ 1109 } 1110 1111 /* 1112 - allocset - allocate a set of characters for [] 1113 == static cset *allocset(struct parse *p); 1114 */ 1115 static cset * 1116 allocset(struct parse *p) 1117 { 1118 cset *cs, *ncs; 1119 1120 ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs)); 1121 if (ncs == NULL) { 1122 SETERROR(REG_ESPACE); 1123 return (NULL); 1124 } 1125 p->g->sets = ncs; 1126 cs = &p->g->sets[p->g->ncsets++]; 1127 memset(cs, 0, sizeof(*cs)); 1128 1129 return(cs); 1130 } 1131 1132 /* 1133 - freeset - free a now-unused set 1134 == static void freeset(struct parse *p, cset *cs); 1135 */ 1136 static void 1137 freeset(struct parse *p, cset *cs) 1138 { 1139 cset *top = &p->g->sets[p->g->ncsets]; 1140 1141 free(cs->wides); 1142 free(cs->ranges); 1143 free(cs->types); 1144 memset(cs, 0, sizeof(*cs)); 1145 if (cs == top-1) /* recover only the easy case */ 1146 p->g->ncsets--; 1147 } 1148 1149 /* 1150 - singleton - Determine whether a set contains only one character, 1151 - returning it if so, otherwise returning OUT. 1152 */ 1153 static wint_t 1154 singleton(cset *cs) 1155 { 1156 wint_t i, s, n; 1157 1158 for (i = n = 0; i < NC; i++) 1159 if (CHIN(cs, i)) { 1160 n++; 1161 s = i; 1162 } 1163 if (n == 1) 1164 return (s); 1165 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1166 cs->icase == 0) 1167 return (cs->wides[0]); 1168 /* Don't bother handling the other cases. */ 1169 return (OUT); 1170 } 1171 1172 /* 1173 - CHadd - add character to character set. 1174 */ 1175 static void 1176 CHadd(struct parse *p, cset *cs, wint_t ch) 1177 { 1178 wint_t nch, *newwides; 1179 assert(ch >= 0); 1180 if (ch < NC) 1181 cs->bmp[ch >> 3] |= 1 << (ch & 7); 1182 else { 1183 newwides = realloc(cs->wides, (cs->nwides + 1) * 1184 sizeof(*cs->wides)); 1185 if (newwides == NULL) { 1186 SETERROR(REG_ESPACE); 1187 return; 1188 } 1189 cs->wides = newwides; 1190 cs->wides[cs->nwides++] = ch; 1191 } 1192 if (cs->icase) { 1193 if ((nch = towlower(ch)) < NC) 1194 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1195 if ((nch = towupper(ch)) < NC) 1196 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1197 } 1198 } 1199 1200 /* 1201 - CHaddrange - add all characters in the range [min,max] to a character set. 1202 */ 1203 static void 1204 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 1205 { 1206 crange *newranges; 1207 1208 for (; min < NC && min <= max; min++) 1209 CHadd(p, cs, min); 1210 if (min >= max) 1211 return; 1212 newranges = realloc(cs->ranges, (cs->nranges + 1) * 1213 sizeof(*cs->ranges)); 1214 if (newranges == NULL) { 1215 SETERROR(REG_ESPACE); 1216 return; 1217 } 1218 cs->ranges = newranges; 1219 cs->ranges[cs->nranges].min = min; 1220 cs->ranges[cs->nranges].min = max; 1221 cs->nranges++; 1222 } 1223 1224 /* 1225 - CHaddtype - add all characters of a certain type to a character set. 1226 */ 1227 static void 1228 CHaddtype(struct parse *p, cset *cs, wctype_t wct) 1229 { 1230 wint_t i; 1231 wctype_t *newtypes; 1232 1233 for (i = 0; i < NC; i++) 1234 if (iswctype(i, wct)) 1235 CHadd(p, cs, i); 1236 newtypes = realloc(cs->types, (cs->ntypes + 1) * 1237 sizeof(*cs->types)); 1238 if (newtypes == NULL) { 1239 SETERROR(REG_ESPACE); 1240 return; 1241 } 1242 cs->types = newtypes; 1243 cs->types[cs->ntypes++] = wct; 1244 } 1245 1246 /* 1247 - dupl - emit a duplicate of a bunch of sops 1248 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1249 */ 1250 static sopno /* start of duplicate */ 1251 dupl(struct parse *p, 1252 sopno start, /* from here */ 1253 sopno finish) /* to this less one */ 1254 { 1255 sopno ret = HERE(); 1256 sopno len = finish - start; 1257 1258 assert(finish >= start); 1259 if (len == 0) 1260 return(ret); 1261 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1262 return(ret); 1263 (void) memcpy((char *)(p->strip + p->slen), 1264 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1265 p->slen += len; 1266 return(ret); 1267 } 1268 1269 /* 1270 - doemit - emit a strip operator 1271 == static void doemit(struct parse *p, sop op, size_t opnd); 1272 * 1273 * It might seem better to implement this as a macro with a function as 1274 * hard-case backup, but it's just too big and messy unless there are 1275 * some changes to the data structures. Maybe later. 1276 */ 1277 static void 1278 doemit(struct parse *p, sop op, size_t opnd) 1279 { 1280 /* avoid making error situations worse */ 1281 if (p->error != 0) 1282 return; 1283 1284 /* deal with oversize operands ("can't happen", more or less) */ 1285 assert(opnd < 1<<OPSHIFT); 1286 1287 /* deal with undersized strip */ 1288 if (p->slen >= p->ssize) 1289 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1290 return; 1291 1292 /* finally, it's all reduced to the easy case */ 1293 p->strip[p->slen++] = SOP(op, opnd); 1294 } 1295 1296 /* 1297 - doinsert - insert a sop into the strip 1298 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1299 */ 1300 static void 1301 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1302 { 1303 sopno sn; 1304 sop s; 1305 int i; 1306 1307 /* avoid making error situations worse */ 1308 if (p->error != 0) 1309 return; 1310 1311 sn = HERE(); 1312 EMIT(op, opnd); /* do checks, ensure space */ 1313 assert(HERE() == sn+1); 1314 s = p->strip[sn]; 1315 1316 /* adjust paren pointers */ 1317 assert(pos > 0); 1318 for (i = 1; i < NPAREN; i++) { 1319 if (p->pbegin[i] >= pos) { 1320 p->pbegin[i]++; 1321 } 1322 if (p->pend[i] >= pos) { 1323 p->pend[i]++; 1324 } 1325 } 1326 1327 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1328 (HERE()-pos-1)*sizeof(sop)); 1329 p->strip[pos] = s; 1330 } 1331 1332 /* 1333 - dofwd - complete a forward reference 1334 == static void dofwd(struct parse *p, sopno pos, sop value); 1335 */ 1336 static void 1337 dofwd(struct parse *p, sopno pos, sop value) 1338 { 1339 /* avoid making error situations worse */ 1340 if (p->error != 0) 1341 return; 1342 1343 assert(value < 1<<OPSHIFT); 1344 p->strip[pos] = OP(p->strip[pos]) | value; 1345 } 1346 1347 /* 1348 - enlarge - enlarge the strip 1349 == static int enlarge(struct parse *p, sopno size); 1350 */ 1351 static int 1352 enlarge(struct parse *p, sopno size) 1353 { 1354 sop *sp; 1355 1356 if (p->ssize >= size) 1357 return 1; 1358 1359 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1360 if (sp == NULL) { 1361 SETERROR(REG_ESPACE); 1362 return 0; 1363 } 1364 p->strip = sp; 1365 p->ssize = size; 1366 return 1; 1367 } 1368 1369 /* 1370 - stripsnug - compact the strip 1371 == static void stripsnug(struct parse *p, struct re_guts *g); 1372 */ 1373 static void 1374 stripsnug(struct parse *p, struct re_guts *g) 1375 { 1376 g->nstates = p->slen; 1377 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1378 if (g->strip == NULL) { 1379 SETERROR(REG_ESPACE); 1380 g->strip = p->strip; 1381 } 1382 } 1383 1384 /* 1385 - findmust - fill in must and mlen with longest mandatory literal string 1386 == static void findmust(struct parse *p, struct re_guts *g); 1387 * 1388 * This algorithm could do fancy things like analyzing the operands of | 1389 * for common subsequences. Someday. This code is simple and finds most 1390 * of the interesting cases. 1391 * 1392 * Note that must and mlen got initialized during setup. 1393 */ 1394 static void 1395 findmust(struct parse *p, struct re_guts *g) 1396 { 1397 sop *scan; 1398 sop *start; 1399 sop *newstart; 1400 sopno newlen; 1401 sop s; 1402 char *cp; 1403 int offset; 1404 char buf[MB_LEN_MAX]; 1405 size_t clen; 1406 mbstate_t mbs; 1407 1408 /* avoid making error situations worse */ 1409 if (p->error != 0) 1410 return; 1411 1412 /* 1413 * It's not generally safe to do a ``char'' substring search on 1414 * multibyte character strings, but it's safe for at least 1415 * UTF-8 (see RFC 3629). 1416 */ 1417 if (MB_CUR_MAX > 1 && 1418 strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) 1419 return; 1420 1421 /* find the longest OCHAR sequence in strip */ 1422 newlen = 0; 1423 offset = 0; 1424 g->moffset = 0; 1425 scan = g->strip + 1; 1426 do { 1427 s = *scan++; 1428 switch (OP(s)) { 1429 case OCHAR: /* sequence member */ 1430 if (newlen == 0) { /* new sequence */ 1431 memset(&mbs, 0, sizeof(mbs)); 1432 newstart = scan - 1; 1433 } 1434 clen = wcrtomb(buf, OPND(s), &mbs); 1435 if (clen == (size_t)-1) 1436 goto toohard; 1437 newlen += clen; 1438 break; 1439 case OPLUS_: /* things that don't break one */ 1440 case OLPAREN: 1441 case ORPAREN: 1442 break; 1443 case OQUEST_: /* things that must be skipped */ 1444 case OCH_: 1445 offset = altoffset(scan, offset); 1446 scan--; 1447 do { 1448 scan += OPND(s); 1449 s = *scan; 1450 /* assert() interferes w debug printouts */ 1451 if (OP(s) != O_QUEST && OP(s) != O_CH && 1452 OP(s) != OOR2) { 1453 g->iflags |= BAD; 1454 return; 1455 } 1456 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1457 /* FALLTHROUGH */ 1458 case OBOW: /* things that break a sequence */ 1459 case OEOW: 1460 case OBOL: 1461 case OEOL: 1462 case O_QUEST: 1463 case O_CH: 1464 case OEND: 1465 if (newlen > g->mlen) { /* ends one */ 1466 start = newstart; 1467 g->mlen = newlen; 1468 if (offset > -1) { 1469 g->moffset += offset; 1470 offset = newlen; 1471 } else 1472 g->moffset = offset; 1473 } else { 1474 if (offset > -1) 1475 offset += newlen; 1476 } 1477 newlen = 0; 1478 break; 1479 case OANY: 1480 if (newlen > g->mlen) { /* ends one */ 1481 start = newstart; 1482 g->mlen = newlen; 1483 if (offset > -1) { 1484 g->moffset += offset; 1485 offset = newlen; 1486 } else 1487 g->moffset = offset; 1488 } else { 1489 if (offset > -1) 1490 offset += newlen; 1491 } 1492 if (offset > -1) 1493 offset++; 1494 newlen = 0; 1495 break; 1496 case OANYOF: /* may or may not invalidate offset */ 1497 /* First, everything as OANY */ 1498 if (newlen > g->mlen) { /* ends one */ 1499 start = newstart; 1500 g->mlen = newlen; 1501 if (offset > -1) { 1502 g->moffset += offset; 1503 offset = newlen; 1504 } else 1505 g->moffset = offset; 1506 } else { 1507 if (offset > -1) 1508 offset += newlen; 1509 } 1510 if (offset > -1) 1511 offset++; 1512 newlen = 0; 1513 break; 1514 toohard: 1515 default: 1516 /* Anything here makes it impossible or too hard 1517 * to calculate the offset -- so we give up; 1518 * save the last known good offset, in case the 1519 * must sequence doesn't occur later. 1520 */ 1521 if (newlen > g->mlen) { /* ends one */ 1522 start = newstart; 1523 g->mlen = newlen; 1524 if (offset > -1) 1525 g->moffset += offset; 1526 else 1527 g->moffset = offset; 1528 } 1529 offset = -1; 1530 newlen = 0; 1531 break; 1532 } 1533 } while (OP(s) != OEND); 1534 1535 if (g->mlen == 0) { /* there isn't one */ 1536 g->moffset = -1; 1537 return; 1538 } 1539 1540 /* turn it into a character string */ 1541 g->must = malloc((size_t)g->mlen + 1); 1542 if (g->must == NULL) { /* argh; just forget it */ 1543 g->mlen = 0; 1544 g->moffset = -1; 1545 return; 1546 } 1547 cp = g->must; 1548 scan = start; 1549 memset(&mbs, 0, sizeof(mbs)); 1550 while (cp < g->must + g->mlen) { 1551 while (OP(s = *scan++) != OCHAR) 1552 continue; 1553 clen = wcrtomb(cp, OPND(s), &mbs); 1554 assert(clen != (size_t)-1); 1555 cp += clen; 1556 } 1557 assert(cp == g->must + g->mlen); 1558 *cp++ = '\0'; /* just on general principles */ 1559 } 1560 1561 /* 1562 - altoffset - choose biggest offset among multiple choices 1563 == static int altoffset(sop *scan, int offset); 1564 * 1565 * Compute, recursively if necessary, the largest offset among multiple 1566 * re paths. 1567 */ 1568 static int 1569 altoffset(sop *scan, int offset) 1570 { 1571 int largest; 1572 int try; 1573 sop s; 1574 1575 /* If we gave up already on offsets, return */ 1576 if (offset == -1) 1577 return -1; 1578 1579 largest = 0; 1580 try = 0; 1581 s = *scan++; 1582 while (OP(s) != O_QUEST && OP(s) != O_CH) { 1583 switch (OP(s)) { 1584 case OOR1: 1585 if (try > largest) 1586 largest = try; 1587 try = 0; 1588 break; 1589 case OQUEST_: 1590 case OCH_: 1591 try = altoffset(scan, try); 1592 if (try == -1) 1593 return -1; 1594 scan--; 1595 do { 1596 scan += OPND(s); 1597 s = *scan; 1598 if (OP(s) != O_QUEST && OP(s) != O_CH && 1599 OP(s) != OOR2) 1600 return -1; 1601 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1602 /* We must skip to the next position, or we'll 1603 * leave altoffset() too early. 1604 */ 1605 scan++; 1606 break; 1607 case OANYOF: 1608 case OCHAR: 1609 case OANY: 1610 try++; 1611 case OBOW: 1612 case OEOW: 1613 case OLPAREN: 1614 case ORPAREN: 1615 case OOR2: 1616 break; 1617 default: 1618 try = -1; 1619 break; 1620 } 1621 if (try == -1) 1622 return -1; 1623 s = *scan++; 1624 } 1625 1626 if (try > largest) 1627 largest = try; 1628 1629 return largest+offset; 1630 } 1631 1632 /* 1633 - computejumps - compute char jumps for BM scan 1634 == static void computejumps(struct parse *p, struct re_guts *g); 1635 * 1636 * This algorithm assumes g->must exists and is has size greater than 1637 * zero. It's based on the algorithm found on Computer Algorithms by 1638 * Sara Baase. 1639 * 1640 * A char jump is the number of characters one needs to jump based on 1641 * the value of the character from the text that was mismatched. 1642 */ 1643 static void 1644 computejumps(struct parse *p, struct re_guts *g) 1645 { 1646 int ch; 1647 int mindex; 1648 1649 /* Avoid making errors worse */ 1650 if (p->error != 0) 1651 return; 1652 1653 g->charjump = (int*) malloc((NC + 1) * sizeof(int)); 1654 if (g->charjump == NULL) /* Not a fatal error */ 1655 return; 1656 /* Adjust for signed chars, if necessary */ 1657 g->charjump = &g->charjump[-(CHAR_MIN)]; 1658 1659 /* If the character does not exist in the pattern, the jump 1660 * is equal to the number of characters in the pattern. 1661 */ 1662 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 1663 g->charjump[ch] = g->mlen; 1664 1665 /* If the character does exist, compute the jump that would 1666 * take us to the last character in the pattern equal to it 1667 * (notice that we match right to left, so that last character 1668 * is the first one that would be matched). 1669 */ 1670 for (mindex = 0; mindex < g->mlen; mindex++) 1671 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 1672 } 1673 1674 /* 1675 - computematchjumps - compute match jumps for BM scan 1676 == static void computematchjumps(struct parse *p, struct re_guts *g); 1677 * 1678 * This algorithm assumes g->must exists and is has size greater than 1679 * zero. It's based on the algorithm found on Computer Algorithms by 1680 * Sara Baase. 1681 * 1682 * A match jump is the number of characters one needs to advance based 1683 * on the already-matched suffix. 1684 * Notice that all values here are minus (g->mlen-1), because of the way 1685 * the search algorithm works. 1686 */ 1687 static void 1688 computematchjumps(struct parse *p, struct re_guts *g) 1689 { 1690 int mindex; /* General "must" iterator */ 1691 int suffix; /* Keeps track of matching suffix */ 1692 int ssuffix; /* Keeps track of suffixes' suffix */ 1693 int* pmatches; /* pmatches[k] points to the next i 1694 * such that i+1...mlen is a substring 1695 * of k+1...k+mlen-i-1 1696 */ 1697 1698 /* Avoid making errors worse */ 1699 if (p->error != 0) 1700 return; 1701 1702 pmatches = (int*) malloc(g->mlen * sizeof(unsigned int)); 1703 if (pmatches == NULL) { 1704 g->matchjump = NULL; 1705 return; 1706 } 1707 1708 g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int)); 1709 if (g->matchjump == NULL) /* Not a fatal error */ 1710 return; 1711 1712 /* Set maximum possible jump for each character in the pattern */ 1713 for (mindex = 0; mindex < g->mlen; mindex++) 1714 g->matchjump[mindex] = 2*g->mlen - mindex - 1; 1715 1716 /* Compute pmatches[] */ 1717 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 1718 mindex--, suffix--) { 1719 pmatches[mindex] = suffix; 1720 1721 /* If a mismatch is found, interrupting the substring, 1722 * compute the matchjump for that position. If no 1723 * mismatch is found, then a text substring mismatched 1724 * against the suffix will also mismatch against the 1725 * substring. 1726 */ 1727 while (suffix < g->mlen 1728 && g->must[mindex] != g->must[suffix]) { 1729 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1730 g->mlen - mindex - 1); 1731 suffix = pmatches[suffix]; 1732 } 1733 } 1734 1735 /* Compute the matchjump up to the last substring found to jump 1736 * to the beginning of the largest must pattern prefix matching 1737 * it's own suffix. 1738 */ 1739 for (mindex = 0; mindex <= suffix; mindex++) 1740 g->matchjump[mindex] = MIN(g->matchjump[mindex], 1741 g->mlen + suffix - mindex); 1742 1743 ssuffix = pmatches[suffix]; 1744 while (suffix < g->mlen) { 1745 while (suffix <= ssuffix && suffix < g->mlen) { 1746 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1747 g->mlen + ssuffix - suffix); 1748 suffix++; 1749 } 1750 if (suffix < g->mlen) 1751 ssuffix = pmatches[ssuffix]; 1752 } 1753 1754 free(pmatches); 1755 } 1756 1757 /* 1758 - pluscount - count + nesting 1759 == static sopno pluscount(struct parse *p, struct re_guts *g); 1760 */ 1761 static sopno /* nesting depth */ 1762 pluscount(struct parse *p, struct re_guts *g) 1763 { 1764 sop *scan; 1765 sop s; 1766 sopno plusnest = 0; 1767 sopno maxnest = 0; 1768 1769 if (p->error != 0) 1770 return(0); /* there may not be an OEND */ 1771 1772 scan = g->strip + 1; 1773 do { 1774 s = *scan++; 1775 switch (OP(s)) { 1776 case OPLUS_: 1777 plusnest++; 1778 break; 1779 case O_PLUS: 1780 if (plusnest > maxnest) 1781 maxnest = plusnest; 1782 plusnest--; 1783 break; 1784 } 1785 } while (OP(s) != OEND); 1786 if (plusnest != 0) 1787 g->iflags |= BAD; 1788 return(maxnest); 1789 } 1790