1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 38 */ 39 40 #if defined(LIBC_SCCS) && !defined(lint) 41 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 42 #endif /* LIBC_SCCS and not lint */ 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 #include <sys/types.h> 47 #include <stdio.h> 48 #include <string.h> 49 #include <ctype.h> 50 #include <limits.h> 51 #include <stdlib.h> 52 #include <regex.h> 53 #include <runetype.h> 54 #include <wchar.h> 55 #include <wctype.h> 56 57 #include "collate.h" 58 59 #include "utils.h" 60 #include "regex2.h" 61 62 #include "cname.h" 63 64 /* 65 * parse structure, passed up and down to avoid global variables and 66 * other clumsinesses 67 */ 68 struct parse { 69 char *next; /* next character in RE */ 70 char *end; /* end of string (-> NUL normally) */ 71 int error; /* has an error been seen? */ 72 sop *strip; /* malloced strip */ 73 sopno ssize; /* malloced strip size (allocated) */ 74 sopno slen; /* malloced strip length (used) */ 75 int ncsalloc; /* number of csets allocated */ 76 struct re_guts *g; 77 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 78 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 79 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 80 }; 81 82 /* ========= begin header generated by ./mkh ========= */ 83 #ifdef __cplusplus 84 extern "C" { 85 #endif 86 87 /* === regcomp.c === */ 88 static void p_ere(struct parse *p, wint_t stop); 89 static void p_ere_exp(struct parse *p); 90 static void p_str(struct parse *p); 91 static void p_bre(struct parse *p, wint_t end1, wint_t end2); 92 static int p_simp_re(struct parse *p, int starordinary); 93 static int p_count(struct parse *p); 94 static void p_bracket(struct parse *p); 95 static void p_b_term(struct parse *p, cset *cs); 96 static void p_b_cclass(struct parse *p, cset *cs); 97 static void p_b_eclass(struct parse *p, cset *cs); 98 static wint_t p_b_symbol(struct parse *p); 99 static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 100 static wint_t othercase(wint_t ch); 101 static void bothcases(struct parse *p, wint_t ch); 102 static void ordinary(struct parse *p, wint_t ch); 103 static void nonnewline(struct parse *p); 104 static void repeat(struct parse *p, sopno start, int from, int to); 105 static int seterr(struct parse *p, int e); 106 static cset *allocset(struct parse *p); 107 static void freeset(struct parse *p, cset *cs); 108 static void CHadd(struct parse *p, cset *cs, wint_t ch); 109 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 110 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 111 static wint_t singleton(cset *cs); 112 static sopno dupl(struct parse *p, sopno start, sopno finish); 113 static void doemit(struct parse *p, sop op, size_t opnd); 114 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 115 static void dofwd(struct parse *p, sopno pos, sop value); 116 static void enlarge(struct parse *p, sopno size); 117 static void stripsnug(struct parse *p, struct re_guts *g); 118 static void findmust(struct parse *p, struct re_guts *g); 119 static int altoffset(sop *scan, int offset); 120 static void computejumps(struct parse *p, struct re_guts *g); 121 static void computematchjumps(struct parse *p, struct re_guts *g); 122 static sopno pluscount(struct parse *p, struct re_guts *g); 123 static wint_t wgetnext(struct parse *p); 124 125 #ifdef __cplusplus 126 } 127 #endif 128 /* ========= end header generated by ./mkh ========= */ 129 130 static char nuls[10]; /* place to point scanner in event of error */ 131 132 /* 133 * macros for use with parse structure 134 * BEWARE: these know that the parse structure is named `p' !!! 135 */ 136 #define PEEK() (*p->next) 137 #define PEEK2() (*(p->next+1)) 138 #define MORE() (p->next < p->end) 139 #define MORE2() (p->next+1 < p->end) 140 #define SEE(c) (MORE() && PEEK() == (c)) 141 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 142 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 143 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 144 #define NEXT() (p->next++) 145 #define NEXT2() (p->next += 2) 146 #define NEXTn(n) (p->next += (n)) 147 #define GETNEXT() (*p->next++) 148 #define WGETNEXT() wgetnext(p) 149 #define SETERROR(e) seterr(p, (e)) 150 #define REQUIRE(co, e) ((co) || SETERROR(e)) 151 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 152 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 153 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 154 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 155 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 156 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 157 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 158 #define HERE() (p->slen) 159 #define THERE() (p->slen - 1) 160 #define THERETHERE() (p->slen - 2) 161 #define DROP(n) (p->slen -= (n)) 162 163 #ifndef NDEBUG 164 static int never = 0; /* for use in asserts; shuts lint up */ 165 #else 166 #define never 0 /* some <assert.h>s have bugs too */ 167 #endif 168 169 /* Macro used by computejump()/computematchjump() */ 170 #define MIN(a,b) ((a)<(b)?(a):(b)) 171 172 /* 173 - regcomp - interface for parser and compilation 174 = extern int regcomp(regex_t *, const char *, int); 175 = #define REG_BASIC 0000 176 = #define REG_EXTENDED 0001 177 = #define REG_ICASE 0002 178 = #define REG_NOSUB 0004 179 = #define REG_NEWLINE 0010 180 = #define REG_NOSPEC 0020 181 = #define REG_PEND 0040 182 = #define REG_DUMP 0200 183 */ 184 int /* 0 success, otherwise REG_something */ 185 regcomp(preg, pattern, cflags) 186 regex_t * __restrict preg; 187 const char * __restrict pattern; 188 int cflags; 189 { 190 struct parse pa; 191 struct re_guts *g; 192 struct parse *p = &pa; 193 int i; 194 size_t len; 195 #ifdef REDEBUG 196 # define GOODFLAGS(f) (f) 197 #else 198 # define GOODFLAGS(f) ((f)&~REG_DUMP) 199 #endif 200 201 cflags = GOODFLAGS(cflags); 202 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 203 return(REG_INVARG); 204 205 if (cflags®_PEND) { 206 if (preg->re_endp < pattern) 207 return(REG_INVARG); 208 len = preg->re_endp - pattern; 209 } else 210 len = strlen((char *)pattern); 211 212 /* do the mallocs early so failure handling is easy */ 213 g = (struct re_guts *)malloc(sizeof(struct re_guts)); 214 if (g == NULL) 215 return(REG_ESPACE); 216 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 217 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 218 p->slen = 0; 219 if (p->strip == NULL) { 220 free((char *)g); 221 return(REG_ESPACE); 222 } 223 224 /* set things up */ 225 p->g = g; 226 p->next = (char *)pattern; /* convenience; we do not modify it */ 227 p->end = p->next + len; 228 p->error = 0; 229 p->ncsalloc = 0; 230 for (i = 0; i < NPAREN; i++) { 231 p->pbegin[i] = 0; 232 p->pend[i] = 0; 233 } 234 g->sets = NULL; 235 g->ncsets = 0; 236 g->cflags = cflags; 237 g->iflags = 0; 238 g->nbol = 0; 239 g->neol = 0; 240 g->must = NULL; 241 g->moffset = -1; 242 g->charjump = NULL; 243 g->matchjump = NULL; 244 g->mlen = 0; 245 g->nsub = 0; 246 g->backrefs = 0; 247 248 /* do it */ 249 EMIT(OEND, 0); 250 g->firststate = THERE(); 251 if (cflags®_EXTENDED) 252 p_ere(p, OUT); 253 else if (cflags®_NOSPEC) 254 p_str(p); 255 else 256 p_bre(p, OUT, OUT); 257 EMIT(OEND, 0); 258 g->laststate = THERE(); 259 260 /* tidy up loose ends and fill things in */ 261 stripsnug(p, g); 262 findmust(p, g); 263 /* only use Boyer-Moore algorithm if the pattern is bigger 264 * than three characters 265 */ 266 if(g->mlen > 3) { 267 computejumps(p, g); 268 computematchjumps(p, g); 269 if(g->matchjump == NULL && g->charjump != NULL) { 270 free(g->charjump); 271 g->charjump = NULL; 272 } 273 } 274 g->nplus = pluscount(p, g); 275 g->magic = MAGIC2; 276 preg->re_nsub = g->nsub; 277 preg->re_g = g; 278 preg->re_magic = MAGIC1; 279 #ifndef REDEBUG 280 /* not debugging, so can't rely on the assert() in regexec() */ 281 if (g->iflags&BAD) 282 SETERROR(REG_ASSERT); 283 #endif 284 285 /* win or lose, we're done */ 286 if (p->error != 0) /* lose */ 287 regfree(preg); 288 return(p->error); 289 } 290 291 /* 292 - p_ere - ERE parser top level, concatenation and alternation 293 == static void p_ere(struct parse *p, int stop); 294 */ 295 static void 296 p_ere(p, stop) 297 struct parse *p; 298 int stop; /* character this ERE should end at */ 299 { 300 char c; 301 sopno prevback; 302 sopno prevfwd; 303 sopno conc; 304 int first = 1; /* is this the first alternative? */ 305 306 for (;;) { 307 /* do a bunch of concatenated expressions */ 308 conc = HERE(); 309 while (MORE() && (c = PEEK()) != '|' && c != stop) 310 p_ere_exp(p); 311 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 312 313 if (!EAT('|')) 314 break; /* NOTE BREAK OUT */ 315 316 if (first) { 317 INSERT(OCH_, conc); /* offset is wrong */ 318 prevfwd = conc; 319 prevback = conc; 320 first = 0; 321 } 322 ASTERN(OOR1, prevback); 323 prevback = THERE(); 324 AHEAD(prevfwd); /* fix previous offset */ 325 prevfwd = HERE(); 326 EMIT(OOR2, 0); /* offset is very wrong */ 327 } 328 329 if (!first) { /* tail-end fixups */ 330 AHEAD(prevfwd); 331 ASTERN(O_CH, prevback); 332 } 333 334 assert(!MORE() || SEE(stop)); 335 } 336 337 /* 338 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 339 == static void p_ere_exp(struct parse *p); 340 */ 341 static void 342 p_ere_exp(p) 343 struct parse *p; 344 { 345 char c; 346 wint_t wc; 347 sopno pos; 348 int count; 349 int count2; 350 sopno subno; 351 int wascaret = 0; 352 353 assert(MORE()); /* caller should have ensured this */ 354 c = GETNEXT(); 355 356 pos = HERE(); 357 switch (c) { 358 case '(': 359 (void)REQUIRE(MORE(), REG_EPAREN); 360 p->g->nsub++; 361 subno = p->g->nsub; 362 if (subno < NPAREN) 363 p->pbegin[subno] = HERE(); 364 EMIT(OLPAREN, subno); 365 if (!SEE(')')) 366 p_ere(p, ')'); 367 if (subno < NPAREN) { 368 p->pend[subno] = HERE(); 369 assert(p->pend[subno] != 0); 370 } 371 EMIT(ORPAREN, subno); 372 (void)MUSTEAT(')', REG_EPAREN); 373 break; 374 #ifndef POSIX_MISTAKE 375 case ')': /* happens only if no current unmatched ( */ 376 /* 377 * You may ask, why the ifndef? Because I didn't notice 378 * this until slightly too late for 1003.2, and none of the 379 * other 1003.2 regular-expression reviewers noticed it at 380 * all. So an unmatched ) is legal POSIX, at least until 381 * we can get it fixed. 382 */ 383 SETERROR(REG_EPAREN); 384 break; 385 #endif 386 case '^': 387 EMIT(OBOL, 0); 388 p->g->iflags |= USEBOL; 389 p->g->nbol++; 390 wascaret = 1; 391 break; 392 case '$': 393 EMIT(OEOL, 0); 394 p->g->iflags |= USEEOL; 395 p->g->neol++; 396 break; 397 case '|': 398 SETERROR(REG_EMPTY); 399 break; 400 case '*': 401 case '+': 402 case '?': 403 SETERROR(REG_BADRPT); 404 break; 405 case '.': 406 if (p->g->cflags®_NEWLINE) 407 nonnewline(p); 408 else 409 EMIT(OANY, 0); 410 break; 411 case '[': 412 p_bracket(p); 413 break; 414 case '\\': 415 (void)REQUIRE(MORE(), REG_EESCAPE); 416 wc = WGETNEXT(); 417 ordinary(p, wc); 418 break; 419 case '{': /* okay as ordinary except if digit follows */ 420 (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 421 /* FALLTHROUGH */ 422 default: 423 p->next--; 424 wc = WGETNEXT(); 425 ordinary(p, wc); 426 break; 427 } 428 429 if (!MORE()) 430 return; 431 c = PEEK(); 432 /* we call { a repetition if followed by a digit */ 433 if (!( c == '*' || c == '+' || c == '?' || 434 (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) 435 return; /* no repetition, we're done */ 436 NEXT(); 437 438 (void)REQUIRE(!wascaret, REG_BADRPT); 439 switch (c) { 440 case '*': /* implemented as +? */ 441 /* this case does not require the (y|) trick, noKLUDGE */ 442 INSERT(OPLUS_, pos); 443 ASTERN(O_PLUS, pos); 444 INSERT(OQUEST_, pos); 445 ASTERN(O_QUEST, pos); 446 break; 447 case '+': 448 INSERT(OPLUS_, pos); 449 ASTERN(O_PLUS, pos); 450 break; 451 case '?': 452 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 453 INSERT(OCH_, pos); /* offset slightly wrong */ 454 ASTERN(OOR1, pos); /* this one's right */ 455 AHEAD(pos); /* fix the OCH_ */ 456 EMIT(OOR2, 0); /* offset very wrong... */ 457 AHEAD(THERE()); /* ...so fix it */ 458 ASTERN(O_CH, THERETHERE()); 459 break; 460 case '{': 461 count = p_count(p); 462 if (EAT(',')) { 463 if (isdigit((uch)PEEK())) { 464 count2 = p_count(p); 465 (void)REQUIRE(count <= count2, REG_BADBR); 466 } else /* single number with comma */ 467 count2 = INFINITY; 468 } else /* just a single number */ 469 count2 = count; 470 repeat(p, pos, count, count2); 471 if (!EAT('}')) { /* error heuristics */ 472 while (MORE() && PEEK() != '}') 473 NEXT(); 474 (void)REQUIRE(MORE(), REG_EBRACE); 475 SETERROR(REG_BADBR); 476 } 477 break; 478 } 479 480 if (!MORE()) 481 return; 482 c = PEEK(); 483 if (!( c == '*' || c == '+' || c == '?' || 484 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 485 return; 486 SETERROR(REG_BADRPT); 487 } 488 489 /* 490 - p_str - string (no metacharacters) "parser" 491 == static void p_str(struct parse *p); 492 */ 493 static void 494 p_str(p) 495 struct parse *p; 496 { 497 (void)REQUIRE(MORE(), REG_EMPTY); 498 while (MORE()) 499 ordinary(p, WGETNEXT()); 500 } 501 502 /* 503 - p_bre - BRE parser top level, anchoring and concatenation 504 == static void p_bre(struct parse *p, int end1, \ 505 == int end2); 506 * Giving end1 as OUT essentially eliminates the end1/end2 check. 507 * 508 * This implementation is a bit of a kludge, in that a trailing $ is first 509 * taken as an ordinary character and then revised to be an anchor. 510 * The amount of lookahead needed to avoid this kludge is excessive. 511 */ 512 static void 513 p_bre(p, end1, end2) 514 struct parse *p; 515 int end1; /* first terminating character */ 516 int end2; /* second terminating character */ 517 { 518 sopno start = HERE(); 519 int first = 1; /* first subexpression? */ 520 int wasdollar = 0; 521 522 if (EAT('^')) { 523 EMIT(OBOL, 0); 524 p->g->iflags |= USEBOL; 525 p->g->nbol++; 526 } 527 while (MORE() && !SEETWO(end1, end2)) { 528 wasdollar = p_simp_re(p, first); 529 first = 0; 530 } 531 if (wasdollar) { /* oops, that was a trailing anchor */ 532 DROP(1); 533 EMIT(OEOL, 0); 534 p->g->iflags |= USEEOL; 535 p->g->neol++; 536 } 537 538 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 539 } 540 541 /* 542 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 543 == static int p_simp_re(struct parse *p, int starordinary); 544 */ 545 static int /* was the simple RE an unbackslashed $? */ 546 p_simp_re(p, starordinary) 547 struct parse *p; 548 int starordinary; /* is a leading * an ordinary character? */ 549 { 550 int c; 551 int count; 552 int count2; 553 sopno pos; 554 int i; 555 wint_t wc; 556 sopno subno; 557 # define BACKSL (1<<CHAR_BIT) 558 559 pos = HERE(); /* repetion op, if any, covers from here */ 560 561 assert(MORE()); /* caller should have ensured this */ 562 c = GETNEXT(); 563 if (c == '\\') { 564 (void)REQUIRE(MORE(), REG_EESCAPE); 565 c = BACKSL | GETNEXT(); 566 } 567 switch (c) { 568 case '.': 569 if (p->g->cflags®_NEWLINE) 570 nonnewline(p); 571 else 572 EMIT(OANY, 0); 573 break; 574 case '[': 575 p_bracket(p); 576 break; 577 case BACKSL|'{': 578 SETERROR(REG_BADRPT); 579 break; 580 case BACKSL|'(': 581 p->g->nsub++; 582 subno = p->g->nsub; 583 if (subno < NPAREN) 584 p->pbegin[subno] = HERE(); 585 EMIT(OLPAREN, subno); 586 /* the MORE here is an error heuristic */ 587 if (MORE() && !SEETWO('\\', ')')) 588 p_bre(p, '\\', ')'); 589 if (subno < NPAREN) { 590 p->pend[subno] = HERE(); 591 assert(p->pend[subno] != 0); 592 } 593 EMIT(ORPAREN, subno); 594 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 595 break; 596 case BACKSL|')': /* should not get here -- must be user */ 597 case BACKSL|'}': 598 SETERROR(REG_EPAREN); 599 break; 600 case BACKSL|'1': 601 case BACKSL|'2': 602 case BACKSL|'3': 603 case BACKSL|'4': 604 case BACKSL|'5': 605 case BACKSL|'6': 606 case BACKSL|'7': 607 case BACKSL|'8': 608 case BACKSL|'9': 609 i = (c&~BACKSL) - '0'; 610 assert(i < NPAREN); 611 if (p->pend[i] != 0) { 612 assert(i <= p->g->nsub); 613 EMIT(OBACK_, i); 614 assert(p->pbegin[i] != 0); 615 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 616 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 617 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 618 EMIT(O_BACK, i); 619 } else 620 SETERROR(REG_ESUBREG); 621 p->g->backrefs = 1; 622 break; 623 case '*': 624 (void)REQUIRE(starordinary, REG_BADRPT); 625 /* FALLTHROUGH */ 626 default: 627 p->next--; 628 wc = WGETNEXT(); 629 ordinary(p, wc); 630 break; 631 } 632 633 if (EAT('*')) { /* implemented as +? */ 634 /* this case does not require the (y|) trick, noKLUDGE */ 635 INSERT(OPLUS_, pos); 636 ASTERN(O_PLUS, pos); 637 INSERT(OQUEST_, pos); 638 ASTERN(O_QUEST, pos); 639 } else if (EATTWO('\\', '{')) { 640 count = p_count(p); 641 if (EAT(',')) { 642 if (MORE() && isdigit((uch)PEEK())) { 643 count2 = p_count(p); 644 (void)REQUIRE(count <= count2, REG_BADBR); 645 } else /* single number with comma */ 646 count2 = INFINITY; 647 } else /* just a single number */ 648 count2 = count; 649 repeat(p, pos, count, count2); 650 if (!EATTWO('\\', '}')) { /* error heuristics */ 651 while (MORE() && !SEETWO('\\', '}')) 652 NEXT(); 653 (void)REQUIRE(MORE(), REG_EBRACE); 654 SETERROR(REG_BADBR); 655 } 656 } else if (c == '$') /* $ (but not \$) ends it */ 657 return(1); 658 659 return(0); 660 } 661 662 /* 663 - p_count - parse a repetition count 664 == static int p_count(struct parse *p); 665 */ 666 static int /* the value */ 667 p_count(p) 668 struct parse *p; 669 { 670 int count = 0; 671 int ndigits = 0; 672 673 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 674 count = count*10 + (GETNEXT() - '0'); 675 ndigits++; 676 } 677 678 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 679 return(count); 680 } 681 682 /* 683 - p_bracket - parse a bracketed character list 684 == static void p_bracket(struct parse *p); 685 */ 686 static void 687 p_bracket(p) 688 struct parse *p; 689 { 690 cset *cs; 691 wint_t ch; 692 693 /* Dept of Truly Sickening Special-Case Kludges */ 694 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 695 EMIT(OBOW, 0); 696 NEXTn(6); 697 return; 698 } 699 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 700 EMIT(OEOW, 0); 701 NEXTn(6); 702 return; 703 } 704 705 if ((cs = allocset(p)) == NULL) 706 return; 707 708 if (p->g->cflags®_ICASE) 709 cs->icase = 1; 710 if (EAT('^')) 711 cs->invert = 1; 712 if (EAT(']')) 713 CHadd(p, cs, ']'); 714 else if (EAT('-')) 715 CHadd(p, cs, '-'); 716 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 717 p_b_term(p, cs); 718 if (EAT('-')) 719 CHadd(p, cs, '-'); 720 (void)MUSTEAT(']', REG_EBRACK); 721 722 if (p->error != 0) /* don't mess things up further */ 723 return; 724 725 if (cs->invert && p->g->cflags®_NEWLINE) 726 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 727 728 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 729 ordinary(p, ch); 730 freeset(p, cs); 731 } else 732 EMIT(OANYOF, (int)(cs - p->g->sets)); 733 } 734 735 /* 736 - p_b_term - parse one term of a bracketed character list 737 == static void p_b_term(struct parse *p, cset *cs); 738 */ 739 static void 740 p_b_term(p, cs) 741 struct parse *p; 742 cset *cs; 743 { 744 char c; 745 wint_t start, finish; 746 wint_t i; 747 748 /* classify what we've got */ 749 switch ((MORE()) ? PEEK() : '\0') { 750 case '[': 751 c = (MORE2()) ? PEEK2() : '\0'; 752 break; 753 case '-': 754 SETERROR(REG_ERANGE); 755 return; /* NOTE RETURN */ 756 break; 757 default: 758 c = '\0'; 759 break; 760 } 761 762 switch (c) { 763 case ':': /* character class */ 764 NEXT2(); 765 (void)REQUIRE(MORE(), REG_EBRACK); 766 c = PEEK(); 767 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 768 p_b_cclass(p, cs); 769 (void)REQUIRE(MORE(), REG_EBRACK); 770 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 771 break; 772 case '=': /* equivalence class */ 773 NEXT2(); 774 (void)REQUIRE(MORE(), REG_EBRACK); 775 c = PEEK(); 776 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 777 p_b_eclass(p, cs); 778 (void)REQUIRE(MORE(), REG_EBRACK); 779 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 780 break; 781 default: /* symbol, ordinary character, or range */ 782 start = p_b_symbol(p); 783 if (SEE('-') && MORE2() && PEEK2() != ']') { 784 /* range */ 785 NEXT(); 786 if (EAT('-')) 787 finish = '-'; 788 else 789 finish = p_b_symbol(p); 790 } else 791 finish = start; 792 if (start == finish) 793 CHadd(p, cs, start); 794 else { 795 if (__collate_load_error) { 796 (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); 797 CHaddrange(p, cs, start, finish); 798 } else { 799 (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); 800 for (i = 0; i <= UCHAR_MAX; i++) { 801 if ( __collate_range_cmp(start, i) <= 0 802 && __collate_range_cmp(i, finish) <= 0 803 ) 804 CHadd(p, cs, i); 805 } 806 } 807 } 808 break; 809 } 810 } 811 812 /* 813 - p_b_cclass - parse a character-class name and deal with it 814 == static void p_b_cclass(struct parse *p, cset *cs); 815 */ 816 static void 817 p_b_cclass(p, cs) 818 struct parse *p; 819 cset *cs; 820 { 821 char *sp = p->next; 822 size_t len; 823 wctype_t wct; 824 char clname[16]; 825 826 while (MORE() && isalpha((uch)PEEK())) 827 NEXT(); 828 len = p->next - sp; 829 if (len >= sizeof(clname) - 1) { 830 SETERROR(REG_ECTYPE); 831 return; 832 } 833 memcpy(clname, sp, len); 834 clname[len] = '\0'; 835 if ((wct = wctype(clname)) == 0) { 836 SETERROR(REG_ECTYPE); 837 return; 838 } 839 CHaddtype(p, cs, wct); 840 } 841 842 /* 843 - p_b_eclass - parse an equivalence-class name and deal with it 844 == static void p_b_eclass(struct parse *p, cset *cs); 845 * 846 * This implementation is incomplete. xxx 847 */ 848 static void 849 p_b_eclass(p, cs) 850 struct parse *p; 851 cset *cs; 852 { 853 wint_t c; 854 855 c = p_b_coll_elem(p, '='); 856 CHadd(p, cs, c); 857 } 858 859 /* 860 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 861 == static char p_b_symbol(struct parse *p); 862 */ 863 static wint_t /* value of symbol */ 864 p_b_symbol(p) 865 struct parse *p; 866 { 867 wint_t value; 868 869 (void)REQUIRE(MORE(), REG_EBRACK); 870 if (!EATTWO('[', '.')) 871 return(WGETNEXT()); 872 873 /* collating symbol */ 874 value = p_b_coll_elem(p, '.'); 875 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 876 return(value); 877 } 878 879 /* 880 - p_b_coll_elem - parse a collating-element name and look it up 881 == static char p_b_coll_elem(struct parse *p, int endc); 882 */ 883 static wint_t /* value of collating element */ 884 p_b_coll_elem(p, endc) 885 struct parse *p; 886 wint_t endc; /* name ended by endc,']' */ 887 { 888 char *sp = p->next; 889 struct cname *cp; 890 int len; 891 mbstate_t mbs; 892 wchar_t wc; 893 size_t clen; 894 895 while (MORE() && !SEETWO(endc, ']')) 896 NEXT(); 897 if (!MORE()) { 898 SETERROR(REG_EBRACK); 899 return(0); 900 } 901 len = p->next - sp; 902 for (cp = cnames; cp->name != NULL; cp++) 903 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 904 return(cp->code); /* known name */ 905 memset(&mbs, 0, sizeof(mbs)); 906 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 907 return (wc); /* single character */ 908 else if (clen == (size_t)-1 || clen == (size_t)-2) 909 SETERROR(REG_ILLSEQ); 910 else 911 SETERROR(REG_ECOLLATE); /* neither */ 912 return(0); 913 } 914 915 /* 916 - othercase - return the case counterpart of an alphabetic 917 == static char othercase(int ch); 918 */ 919 static wint_t /* if no counterpart, return ch */ 920 othercase(ch) 921 wint_t ch; 922 { 923 assert(iswalpha(ch)); 924 if (iswupper(ch)) 925 return(towlower(ch)); 926 else if (iswlower(ch)) 927 return(towupper(ch)); 928 else /* peculiar, but could happen */ 929 return(ch); 930 } 931 932 /* 933 - bothcases - emit a dualcase version of a two-case character 934 == static void bothcases(struct parse *p, int ch); 935 * 936 * Boy, is this implementation ever a kludge... 937 */ 938 static void 939 bothcases(p, ch) 940 struct parse *p; 941 wint_t ch; 942 { 943 char *oldnext = p->next; 944 char *oldend = p->end; 945 char bracket[3 + MB_LEN_MAX]; 946 size_t n; 947 mbstate_t mbs; 948 949 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 950 p->next = bracket; 951 memset(&mbs, 0, sizeof(mbs)); 952 n = wcrtomb(bracket, ch, &mbs); 953 assert(n != (size_t)-1); 954 bracket[n] = ']'; 955 bracket[n + 1] = '\0'; 956 p->end = bracket+n+1; 957 p_bracket(p); 958 assert(p->next == p->end); 959 p->next = oldnext; 960 p->end = oldend; 961 } 962 963 /* 964 - ordinary - emit an ordinary character 965 == static void ordinary(struct parse *p, int ch); 966 */ 967 static void 968 ordinary(p, ch) 969 struct parse *p; 970 wint_t ch; 971 { 972 cset *cs; 973 974 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 975 bothcases(p, ch); 976 else if ((ch & OPDMASK) == ch) 977 EMIT(OCHAR, ch); 978 else { 979 /* 980 * Kludge: character is too big to fit into an OCHAR operand. 981 * Emit a singleton set. 982 */ 983 if ((cs = allocset(p)) == NULL) 984 return; 985 CHadd(p, cs, ch); 986 EMIT(OANYOF, (int)(cs - p->g->sets)); 987 } 988 } 989 990 /* 991 - nonnewline - emit REG_NEWLINE version of OANY 992 == static void nonnewline(struct parse *p); 993 * 994 * Boy, is this implementation ever a kludge... 995 */ 996 static void 997 nonnewline(p) 998 struct parse *p; 999 { 1000 char *oldnext = p->next; 1001 char *oldend = p->end; 1002 char bracket[4]; 1003 1004 p->next = bracket; 1005 p->end = bracket+3; 1006 bracket[0] = '^'; 1007 bracket[1] = '\n'; 1008 bracket[2] = ']'; 1009 bracket[3] = '\0'; 1010 p_bracket(p); 1011 assert(p->next == bracket+3); 1012 p->next = oldnext; 1013 p->end = oldend; 1014 } 1015 1016 /* 1017 - repeat - generate code for a bounded repetition, recursively if needed 1018 == static void repeat(struct parse *p, sopno start, int from, int to); 1019 */ 1020 static void 1021 repeat(p, start, from, to) 1022 struct parse *p; 1023 sopno start; /* operand from here to end of strip */ 1024 int from; /* repeated from this number */ 1025 int to; /* to this number of times (maybe INFINITY) */ 1026 { 1027 sopno finish = HERE(); 1028 # define N 2 1029 # define INF 3 1030 # define REP(f, t) ((f)*8 + (t)) 1031 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1032 sopno copy; 1033 1034 if (p->error != 0) /* head off possible runaway recursion */ 1035 return; 1036 1037 assert(from <= to); 1038 1039 switch (REP(MAP(from), MAP(to))) { 1040 case REP(0, 0): /* must be user doing this */ 1041 DROP(finish-start); /* drop the operand */ 1042 break; 1043 case REP(0, 1): /* as x{1,1}? */ 1044 case REP(0, N): /* as x{1,n}? */ 1045 case REP(0, INF): /* as x{1,}? */ 1046 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1047 INSERT(OCH_, start); /* offset is wrong... */ 1048 repeat(p, start+1, 1, to); 1049 ASTERN(OOR1, start); 1050 AHEAD(start); /* ... fix it */ 1051 EMIT(OOR2, 0); 1052 AHEAD(THERE()); 1053 ASTERN(O_CH, THERETHERE()); 1054 break; 1055 case REP(1, 1): /* trivial case */ 1056 /* done */ 1057 break; 1058 case REP(1, N): /* as x?x{1,n-1} */ 1059 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1060 INSERT(OCH_, start); 1061 ASTERN(OOR1, start); 1062 AHEAD(start); 1063 EMIT(OOR2, 0); /* offset very wrong... */ 1064 AHEAD(THERE()); /* ...so fix it */ 1065 ASTERN(O_CH, THERETHERE()); 1066 copy = dupl(p, start+1, finish+1); 1067 assert(copy == finish+4); 1068 repeat(p, copy, 1, to-1); 1069 break; 1070 case REP(1, INF): /* as x+ */ 1071 INSERT(OPLUS_, start); 1072 ASTERN(O_PLUS, start); 1073 break; 1074 case REP(N, N): /* as xx{m-1,n-1} */ 1075 copy = dupl(p, start, finish); 1076 repeat(p, copy, from-1, to-1); 1077 break; 1078 case REP(N, INF): /* as xx{n-1,INF} */ 1079 copy = dupl(p, start, finish); 1080 repeat(p, copy, from-1, to); 1081 break; 1082 default: /* "can't happen" */ 1083 SETERROR(REG_ASSERT); /* just in case */ 1084 break; 1085 } 1086 } 1087 1088 /* 1089 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1090 - character from the parse struct, signals a REG_ILLSEQ error if the 1091 - character can't be converted. Returns the number of bytes consumed. 1092 */ 1093 static wint_t 1094 wgetnext(p) 1095 struct parse *p; 1096 { 1097 mbstate_t mbs; 1098 wchar_t wc; 1099 size_t n; 1100 1101 memset(&mbs, 0, sizeof(mbs)); 1102 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); 1103 if (n == (size_t)-1 || n == (size_t)-2) { 1104 SETERROR(REG_ILLSEQ); 1105 return (0); 1106 } 1107 if (n == 0) 1108 n = 1; 1109 p->next += n; 1110 return (wc); 1111 } 1112 1113 /* 1114 - seterr - set an error condition 1115 == static int seterr(struct parse *p, int e); 1116 */ 1117 static int /* useless but makes type checking happy */ 1118 seterr(p, e) 1119 struct parse *p; 1120 int e; 1121 { 1122 if (p->error == 0) /* keep earliest error condition */ 1123 p->error = e; 1124 p->next = nuls; /* try to bring things to a halt */ 1125 p->end = nuls; 1126 return(0); /* make the return value well-defined */ 1127 } 1128 1129 /* 1130 - allocset - allocate a set of characters for [] 1131 == static cset *allocset(struct parse *p); 1132 */ 1133 static cset * 1134 allocset(p) 1135 struct parse *p; 1136 { 1137 cset *cs, *ncs; 1138 1139 ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs)); 1140 if (ncs == NULL) { 1141 SETERROR(REG_ESPACE); 1142 return (NULL); 1143 } 1144 p->g->sets = ncs; 1145 cs = &p->g->sets[p->g->ncsets++]; 1146 memset(cs, 0, sizeof(*cs)); 1147 1148 return(cs); 1149 } 1150 1151 /* 1152 - freeset - free a now-unused set 1153 == static void freeset(struct parse *p, cset *cs); 1154 */ 1155 static void 1156 freeset(p, cs) 1157 struct parse *p; 1158 cset *cs; 1159 { 1160 cset *top = &p->g->sets[p->g->ncsets]; 1161 1162 free(cs->wides); 1163 free(cs->ranges); 1164 free(cs->types); 1165 memset(cs, 0, sizeof(*cs)); 1166 if (cs == top-1) /* recover only the easy case */ 1167 p->g->ncsets--; 1168 } 1169 1170 /* 1171 - singleton - Determine whether a set contains only one character, 1172 - returning it if so, otherwise returning OUT. 1173 */ 1174 static wint_t 1175 singleton(cs) 1176 cset *cs; 1177 { 1178 wint_t i, s, n; 1179 1180 for (i = n = 0; i < NC; i++) 1181 if (CHIN(cs, i)) { 1182 n++; 1183 s = i; 1184 } 1185 if (n == 1) 1186 return (s); 1187 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1188 cs->icase == 0) 1189 return (cs->wides[0]); 1190 /* Don't bother handling the other cases. */ 1191 return (OUT); 1192 } 1193 1194 /* 1195 - CHadd - add character to character set. 1196 */ 1197 static void 1198 CHadd(p, cs, ch) 1199 struct parse *p; 1200 cset *cs; 1201 wint_t ch; 1202 { 1203 wint_t nch, *newwides; 1204 assert(ch >= 0); 1205 if (ch < NC) 1206 cs->bmp[ch >> 3] |= 1 << (ch & 7); 1207 else { 1208 newwides = realloc(cs->wides, (cs->nwides + 1) * 1209 sizeof(*cs->wides)); 1210 if (newwides == NULL) { 1211 SETERROR(REG_ESPACE); 1212 return; 1213 } 1214 cs->wides = newwides; 1215 cs->wides[cs->nwides++] = ch; 1216 } 1217 if (cs->icase) { 1218 if ((nch = towlower(ch)) < NC) 1219 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1220 if ((nch = towupper(ch)) < NC) 1221 cs->bmp[nch >> 3] |= 1 << (nch & 7); 1222 } 1223 } 1224 1225 /* 1226 - CHaddrange - add all characters in the range [min,max] to a character set. 1227 */ 1228 static void 1229 CHaddrange(p, cs, min, max) 1230 struct parse *p; 1231 cset *cs; 1232 wint_t min, max; 1233 { 1234 crange *newranges; 1235 1236 for (; min < NC && min <= max; min++) 1237 CHadd(p, cs, min); 1238 if (min >= max) 1239 return; 1240 newranges = realloc(cs->ranges, (cs->nranges + 1) * 1241 sizeof(*cs->ranges)); 1242 if (newranges == NULL) { 1243 SETERROR(REG_ESPACE); 1244 return; 1245 } 1246 cs->ranges = newranges; 1247 cs->ranges[cs->nranges].min = min; 1248 cs->ranges[cs->nranges].min = max; 1249 cs->nranges++; 1250 } 1251 1252 /* 1253 - CHaddtype - add all characters of a certain type to a character set. 1254 */ 1255 static void 1256 CHaddtype(p, cs, wct) 1257 struct parse *p; 1258 cset *cs; 1259 wctype_t wct; 1260 { 1261 wint_t i; 1262 wctype_t *newtypes; 1263 1264 for (i = 0; i < NC; i++) 1265 if (iswctype(i, wct)) 1266 CHadd(p, cs, i); 1267 newtypes = realloc(cs->types, (cs->ntypes + 1) * 1268 sizeof(*cs->types)); 1269 if (newtypes == NULL) { 1270 SETERROR(REG_ESPACE); 1271 return; 1272 } 1273 cs->types = newtypes; 1274 cs->types[cs->ntypes++] = wct; 1275 } 1276 1277 /* 1278 - dupl - emit a duplicate of a bunch of sops 1279 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1280 */ 1281 static sopno /* start of duplicate */ 1282 dupl(p, start, finish) 1283 struct parse *p; 1284 sopno start; /* from here */ 1285 sopno finish; /* to this less one */ 1286 { 1287 sopno ret = HERE(); 1288 sopno len = finish - start; 1289 1290 assert(finish >= start); 1291 if (len == 0) 1292 return(ret); 1293 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1294 assert(p->ssize >= p->slen + len); 1295 (void) memcpy((char *)(p->strip + p->slen), 1296 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1297 p->slen += len; 1298 return(ret); 1299 } 1300 1301 /* 1302 - doemit - emit a strip operator 1303 == static void doemit(struct parse *p, sop op, size_t opnd); 1304 * 1305 * It might seem better to implement this as a macro with a function as 1306 * hard-case backup, but it's just too big and messy unless there are 1307 * some changes to the data structures. Maybe later. 1308 */ 1309 static void 1310 doemit(p, op, opnd) 1311 struct parse *p; 1312 sop op; 1313 size_t opnd; 1314 { 1315 /* avoid making error situations worse */ 1316 if (p->error != 0) 1317 return; 1318 1319 /* deal with oversize operands ("can't happen", more or less) */ 1320 assert(opnd < 1<<OPSHIFT); 1321 1322 /* deal with undersized strip */ 1323 if (p->slen >= p->ssize) 1324 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1325 assert(p->slen < p->ssize); 1326 1327 /* finally, it's all reduced to the easy case */ 1328 p->strip[p->slen++] = SOP(op, opnd); 1329 } 1330 1331 /* 1332 - doinsert - insert a sop into the strip 1333 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1334 */ 1335 static void 1336 doinsert(p, op, opnd, pos) 1337 struct parse *p; 1338 sop op; 1339 size_t opnd; 1340 sopno pos; 1341 { 1342 sopno sn; 1343 sop s; 1344 int i; 1345 1346 /* avoid making error situations worse */ 1347 if (p->error != 0) 1348 return; 1349 1350 sn = HERE(); 1351 EMIT(op, opnd); /* do checks, ensure space */ 1352 assert(HERE() == sn+1); 1353 s = p->strip[sn]; 1354 1355 /* adjust paren pointers */ 1356 assert(pos > 0); 1357 for (i = 1; i < NPAREN; i++) { 1358 if (p->pbegin[i] >= pos) { 1359 p->pbegin[i]++; 1360 } 1361 if (p->pend[i] >= pos) { 1362 p->pend[i]++; 1363 } 1364 } 1365 1366 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1367 (HERE()-pos-1)*sizeof(sop)); 1368 p->strip[pos] = s; 1369 } 1370 1371 /* 1372 - dofwd - complete a forward reference 1373 == static void dofwd(struct parse *p, sopno pos, sop value); 1374 */ 1375 static void 1376 dofwd(p, pos, value) 1377 struct parse *p; 1378 sopno pos; 1379 sop value; 1380 { 1381 /* avoid making error situations worse */ 1382 if (p->error != 0) 1383 return; 1384 1385 assert(value < 1<<OPSHIFT); 1386 p->strip[pos] = OP(p->strip[pos]) | value; 1387 } 1388 1389 /* 1390 - enlarge - enlarge the strip 1391 == static void enlarge(struct parse *p, sopno size); 1392 */ 1393 static void 1394 enlarge(p, size) 1395 struct parse *p; 1396 sopno size; 1397 { 1398 sop *sp; 1399 1400 if (p->ssize >= size) 1401 return; 1402 1403 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1404 if (sp == NULL) { 1405 SETERROR(REG_ESPACE); 1406 return; 1407 } 1408 p->strip = sp; 1409 p->ssize = size; 1410 } 1411 1412 /* 1413 - stripsnug - compact the strip 1414 == static void stripsnug(struct parse *p, struct re_guts *g); 1415 */ 1416 static void 1417 stripsnug(p, g) 1418 struct parse *p; 1419 struct re_guts *g; 1420 { 1421 g->nstates = p->slen; 1422 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1423 if (g->strip == NULL) { 1424 SETERROR(REG_ESPACE); 1425 g->strip = p->strip; 1426 } 1427 } 1428 1429 /* 1430 - findmust - fill in must and mlen with longest mandatory literal string 1431 == static void findmust(struct parse *p, struct re_guts *g); 1432 * 1433 * This algorithm could do fancy things like analyzing the operands of | 1434 * for common subsequences. Someday. This code is simple and finds most 1435 * of the interesting cases. 1436 * 1437 * Note that must and mlen got initialized during setup. 1438 */ 1439 static void 1440 findmust(p, g) 1441 struct parse *p; 1442 struct re_guts *g; 1443 { 1444 sop *scan; 1445 sop *start; 1446 sop *newstart; 1447 sopno newlen; 1448 sop s; 1449 char *cp; 1450 int offset; 1451 char buf[MB_LEN_MAX]; 1452 size_t clen; 1453 mbstate_t mbs; 1454 1455 /* avoid making error situations worse */ 1456 if (p->error != 0) 1457 return; 1458 1459 /* 1460 * It's not generally safe to do a ``char'' substring search on 1461 * multibyte character strings, but it's safe for at least 1462 * UTF-8 (see RFC 3629). 1463 */ 1464 if (MB_CUR_MAX > 1 && 1465 strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) 1466 return; 1467 1468 /* find the longest OCHAR sequence in strip */ 1469 newlen = 0; 1470 offset = 0; 1471 g->moffset = 0; 1472 scan = g->strip + 1; 1473 do { 1474 s = *scan++; 1475 switch (OP(s)) { 1476 case OCHAR: /* sequence member */ 1477 if (newlen == 0) { /* new sequence */ 1478 memset(&mbs, 0, sizeof(mbs)); 1479 newstart = scan - 1; 1480 } 1481 clen = wcrtomb(buf, OPND(s), &mbs); 1482 if (clen == (size_t)-1) 1483 goto toohard; 1484 newlen += clen; 1485 break; 1486 case OPLUS_: /* things that don't break one */ 1487 case OLPAREN: 1488 case ORPAREN: 1489 break; 1490 case OQUEST_: /* things that must be skipped */ 1491 case OCH_: 1492 offset = altoffset(scan, offset); 1493 scan--; 1494 do { 1495 scan += OPND(s); 1496 s = *scan; 1497 /* assert() interferes w debug printouts */ 1498 if (OP(s) != O_QUEST && OP(s) != O_CH && 1499 OP(s) != OOR2) { 1500 g->iflags |= BAD; 1501 return; 1502 } 1503 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1504 /* FALLTHROUGH */ 1505 case OBOW: /* things that break a sequence */ 1506 case OEOW: 1507 case OBOL: 1508 case OEOL: 1509 case O_QUEST: 1510 case O_CH: 1511 case OEND: 1512 if (newlen > g->mlen) { /* ends one */ 1513 start = newstart; 1514 g->mlen = newlen; 1515 if (offset > -1) { 1516 g->moffset += offset; 1517 offset = newlen; 1518 } else 1519 g->moffset = offset; 1520 } else { 1521 if (offset > -1) 1522 offset += newlen; 1523 } 1524 newlen = 0; 1525 break; 1526 case OANY: 1527 if (newlen > g->mlen) { /* ends one */ 1528 start = newstart; 1529 g->mlen = newlen; 1530 if (offset > -1) { 1531 g->moffset += offset; 1532 offset = newlen; 1533 } else 1534 g->moffset = offset; 1535 } else { 1536 if (offset > -1) 1537 offset += newlen; 1538 } 1539 if (offset > -1) 1540 offset++; 1541 newlen = 0; 1542 break; 1543 case OANYOF: /* may or may not invalidate offset */ 1544 /* First, everything as OANY */ 1545 if (newlen > g->mlen) { /* ends one */ 1546 start = newstart; 1547 g->mlen = newlen; 1548 if (offset > -1) { 1549 g->moffset += offset; 1550 offset = newlen; 1551 } else 1552 g->moffset = offset; 1553 } else { 1554 if (offset > -1) 1555 offset += newlen; 1556 } 1557 if (offset > -1) 1558 offset++; 1559 newlen = 0; 1560 break; 1561 toohard: 1562 default: 1563 /* Anything here makes it impossible or too hard 1564 * to calculate the offset -- so we give up; 1565 * save the last known good offset, in case the 1566 * must sequence doesn't occur later. 1567 */ 1568 if (newlen > g->mlen) { /* ends one */ 1569 start = newstart; 1570 g->mlen = newlen; 1571 if (offset > -1) 1572 g->moffset += offset; 1573 else 1574 g->moffset = offset; 1575 } 1576 offset = -1; 1577 newlen = 0; 1578 break; 1579 } 1580 } while (OP(s) != OEND); 1581 1582 if (g->mlen == 0) { /* there isn't one */ 1583 g->moffset = -1; 1584 return; 1585 } 1586 1587 /* turn it into a character string */ 1588 g->must = malloc((size_t)g->mlen + 1); 1589 if (g->must == NULL) { /* argh; just forget it */ 1590 g->mlen = 0; 1591 g->moffset = -1; 1592 return; 1593 } 1594 cp = g->must; 1595 scan = start; 1596 memset(&mbs, 0, sizeof(mbs)); 1597 while (cp < g->must + g->mlen) { 1598 while (OP(s = *scan++) != OCHAR) 1599 continue; 1600 clen = wcrtomb(cp, OPND(s), &mbs); 1601 assert(clen != (size_t)-1); 1602 cp += clen; 1603 } 1604 assert(cp == g->must + g->mlen); 1605 *cp++ = '\0'; /* just on general principles */ 1606 } 1607 1608 /* 1609 - altoffset - choose biggest offset among multiple choices 1610 == static int altoffset(sop *scan, int offset); 1611 * 1612 * Compute, recursively if necessary, the largest offset among multiple 1613 * re paths. 1614 */ 1615 static int 1616 altoffset(scan, offset) 1617 sop *scan; 1618 int offset; 1619 { 1620 int largest; 1621 int try; 1622 sop s; 1623 1624 /* If we gave up already on offsets, return */ 1625 if (offset == -1) 1626 return -1; 1627 1628 largest = 0; 1629 try = 0; 1630 s = *scan++; 1631 while (OP(s) != O_QUEST && OP(s) != O_CH) { 1632 switch (OP(s)) { 1633 case OOR1: 1634 if (try > largest) 1635 largest = try; 1636 try = 0; 1637 break; 1638 case OQUEST_: 1639 case OCH_: 1640 try = altoffset(scan, try); 1641 if (try == -1) 1642 return -1; 1643 scan--; 1644 do { 1645 scan += OPND(s); 1646 s = *scan; 1647 if (OP(s) != O_QUEST && OP(s) != O_CH && 1648 OP(s) != OOR2) 1649 return -1; 1650 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1651 /* We must skip to the next position, or we'll 1652 * leave altoffset() too early. 1653 */ 1654 scan++; 1655 break; 1656 case OANYOF: 1657 case OCHAR: 1658 case OANY: 1659 try++; 1660 case OBOW: 1661 case OEOW: 1662 case OLPAREN: 1663 case ORPAREN: 1664 case OOR2: 1665 break; 1666 default: 1667 try = -1; 1668 break; 1669 } 1670 if (try == -1) 1671 return -1; 1672 s = *scan++; 1673 } 1674 1675 if (try > largest) 1676 largest = try; 1677 1678 return largest+offset; 1679 } 1680 1681 /* 1682 - computejumps - compute char jumps for BM scan 1683 == static void computejumps(struct parse *p, struct re_guts *g); 1684 * 1685 * This algorithm assumes g->must exists and is has size greater than 1686 * zero. It's based on the algorithm found on Computer Algorithms by 1687 * Sara Baase. 1688 * 1689 * A char jump is the number of characters one needs to jump based on 1690 * the value of the character from the text that was mismatched. 1691 */ 1692 static void 1693 computejumps(p, g) 1694 struct parse *p; 1695 struct re_guts *g; 1696 { 1697 int ch; 1698 int mindex; 1699 1700 /* Avoid making errors worse */ 1701 if (p->error != 0) 1702 return; 1703 1704 g->charjump = (int*) malloc((NC + 1) * sizeof(int)); 1705 if (g->charjump == NULL) /* Not a fatal error */ 1706 return; 1707 /* Adjust for signed chars, if necessary */ 1708 g->charjump = &g->charjump[-(CHAR_MIN)]; 1709 1710 /* If the character does not exist in the pattern, the jump 1711 * is equal to the number of characters in the pattern. 1712 */ 1713 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 1714 g->charjump[ch] = g->mlen; 1715 1716 /* If the character does exist, compute the jump that would 1717 * take us to the last character in the pattern equal to it 1718 * (notice that we match right to left, so that last character 1719 * is the first one that would be matched). 1720 */ 1721 for (mindex = 0; mindex < g->mlen; mindex++) 1722 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 1723 } 1724 1725 /* 1726 - computematchjumps - compute match jumps for BM scan 1727 == static void computematchjumps(struct parse *p, struct re_guts *g); 1728 * 1729 * This algorithm assumes g->must exists and is has size greater than 1730 * zero. It's based on the algorithm found on Computer Algorithms by 1731 * Sara Baase. 1732 * 1733 * A match jump is the number of characters one needs to advance based 1734 * on the already-matched suffix. 1735 * Notice that all values here are minus (g->mlen-1), because of the way 1736 * the search algorithm works. 1737 */ 1738 static void 1739 computematchjumps(p, g) 1740 struct parse *p; 1741 struct re_guts *g; 1742 { 1743 int mindex; /* General "must" iterator */ 1744 int suffix; /* Keeps track of matching suffix */ 1745 int ssuffix; /* Keeps track of suffixes' suffix */ 1746 int* pmatches; /* pmatches[k] points to the next i 1747 * such that i+1...mlen is a substring 1748 * of k+1...k+mlen-i-1 1749 */ 1750 1751 /* Avoid making errors worse */ 1752 if (p->error != 0) 1753 return; 1754 1755 pmatches = (int*) malloc(g->mlen * sizeof(unsigned int)); 1756 if (pmatches == NULL) { 1757 g->matchjump = NULL; 1758 return; 1759 } 1760 1761 g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int)); 1762 if (g->matchjump == NULL) /* Not a fatal error */ 1763 return; 1764 1765 /* Set maximum possible jump for each character in the pattern */ 1766 for (mindex = 0; mindex < g->mlen; mindex++) 1767 g->matchjump[mindex] = 2*g->mlen - mindex - 1; 1768 1769 /* Compute pmatches[] */ 1770 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 1771 mindex--, suffix--) { 1772 pmatches[mindex] = suffix; 1773 1774 /* If a mismatch is found, interrupting the substring, 1775 * compute the matchjump for that position. If no 1776 * mismatch is found, then a text substring mismatched 1777 * against the suffix will also mismatch against the 1778 * substring. 1779 */ 1780 while (suffix < g->mlen 1781 && g->must[mindex] != g->must[suffix]) { 1782 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1783 g->mlen - mindex - 1); 1784 suffix = pmatches[suffix]; 1785 } 1786 } 1787 1788 /* Compute the matchjump up to the last substring found to jump 1789 * to the beginning of the largest must pattern prefix matching 1790 * it's own suffix. 1791 */ 1792 for (mindex = 0; mindex <= suffix; mindex++) 1793 g->matchjump[mindex] = MIN(g->matchjump[mindex], 1794 g->mlen + suffix - mindex); 1795 1796 ssuffix = pmatches[suffix]; 1797 while (suffix < g->mlen) { 1798 while (suffix <= ssuffix && suffix < g->mlen) { 1799 g->matchjump[suffix] = MIN(g->matchjump[suffix], 1800 g->mlen + ssuffix - suffix); 1801 suffix++; 1802 } 1803 if (suffix < g->mlen) 1804 ssuffix = pmatches[ssuffix]; 1805 } 1806 1807 free(pmatches); 1808 } 1809 1810 /* 1811 - pluscount - count + nesting 1812 == static sopno pluscount(struct parse *p, struct re_guts *g); 1813 */ 1814 static sopno /* nesting depth */ 1815 pluscount(p, g) 1816 struct parse *p; 1817 struct re_guts *g; 1818 { 1819 sop *scan; 1820 sop s; 1821 sopno plusnest = 0; 1822 sopno maxnest = 0; 1823 1824 if (p->error != 0) 1825 return(0); /* there may not be an OEND */ 1826 1827 scan = g->strip + 1; 1828 do { 1829 s = *scan++; 1830 switch (OP(s)) { 1831 case OPLUS_: 1832 plusnest++; 1833 break; 1834 case O_PLUS: 1835 if (plusnest > maxnest) 1836 maxnest = plusnest; 1837 plusnest--; 1838 break; 1839 } 1840 } while (OP(s) != OEND); 1841 if (plusnest != 0) 1842 g->iflags |= BAD; 1843 return(maxnest); 1844 } 1845