1 /*- 2 * This code is derived from OpenBSD's libc/regex, original license follows: 3 * 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 36 */ 37 38 #include "regex_impl.h" 39 #include <ctype.h> 40 #include <limits.h> 41 #include <stdint.h> 42 #include <stdio.h> 43 #include <stdlib.h> 44 #include <string.h> 45 #include <sys/types.h> 46 47 #include "regex2.h" 48 #include "regutils.h" 49 50 #include "llvm/Config/config.h" 51 #include "llvm/Support/Compiler.h" 52 53 /* character-class table */ 54 static struct cclass { 55 const char *name; 56 const char *chars; 57 const char *multis; 58 } cclasses[] = { 59 {"alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ 60 0123456789", 61 ""}, 62 {"alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", ""}, 63 {"blank", " \t", ""}, 64 {"cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\ 65 \25\26\27\30\31\32\33\34\35\36\37\177", 66 ""}, 67 {"digit", "0123456789", ""}, 68 {"graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ 69 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", 70 ""}, 71 {"lower", "abcdefghijklmnopqrstuvwxyz", ""}, 72 {"print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ 73 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ", 74 ""}, 75 {"punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", ""}, 76 {"space", "\t\n\v\f\r ", ""}, 77 {"upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", ""}, 78 {"xdigit", "0123456789ABCDEFabcdef", ""}, 79 {NULL, 0, ""}}; 80 81 /* character-name table */ 82 static struct cname { 83 const char *name; 84 char code; 85 } cnames[] = {{"NUL", '\0'}, 86 {"SOH", '\001'}, 87 {"STX", '\002'}, 88 {"ETX", '\003'}, 89 {"EOT", '\004'}, 90 {"ENQ", '\005'}, 91 {"ACK", '\006'}, 92 {"BEL", '\007'}, 93 {"alert", '\007'}, 94 {"BS", '\010'}, 95 {"backspace", '\b'}, 96 {"HT", '\011'}, 97 {"tab", '\t'}, 98 {"LF", '\012'}, 99 {"newline", '\n'}, 100 {"VT", '\013'}, 101 {"vertical-tab", '\v'}, 102 {"FF", '\014'}, 103 {"form-feed", '\f'}, 104 {"CR", '\015'}, 105 {"carriage-return", '\r'}, 106 {"SO", '\016'}, 107 {"SI", '\017'}, 108 {"DLE", '\020'}, 109 {"DC1", '\021'}, 110 {"DC2", '\022'}, 111 {"DC3", '\023'}, 112 {"DC4", '\024'}, 113 {"NAK", '\025'}, 114 {"SYN", '\026'}, 115 {"ETB", '\027'}, 116 {"CAN", '\030'}, 117 {"EM", '\031'}, 118 {"SUB", '\032'}, 119 {"ESC", '\033'}, 120 {"IS4", '\034'}, 121 {"FS", '\034'}, 122 {"IS3", '\035'}, 123 {"GS", '\035'}, 124 {"IS2", '\036'}, 125 {"RS", '\036'}, 126 {"IS1", '\037'}, 127 {"US", '\037'}, 128 {"space", ' '}, 129 {"exclamation-mark", '!'}, 130 {"quotation-mark", '"'}, 131 {"number-sign", '#'}, 132 {"dollar-sign", '$'}, 133 {"percent-sign", '%'}, 134 {"ampersand", '&'}, 135 {"apostrophe", '\''}, 136 {"left-parenthesis", '('}, 137 {"right-parenthesis", ')'}, 138 {"asterisk", '*'}, 139 {"plus-sign", '+'}, 140 {"comma", ','}, 141 {"hyphen", '-'}, 142 {"hyphen-minus", '-'}, 143 {"period", '.'}, 144 {"full-stop", '.'}, 145 {"slash", '/'}, 146 {"solidus", '/'}, 147 {"zero", '0'}, 148 {"one", '1'}, 149 {"two", '2'}, 150 {"three", '3'}, 151 {"four", '4'}, 152 {"five", '5'}, 153 {"six", '6'}, 154 {"seven", '7'}, 155 {"eight", '8'}, 156 {"nine", '9'}, 157 {"colon", ':'}, 158 {"semicolon", ';'}, 159 {"less-than-sign", '<'}, 160 {"equals-sign", '='}, 161 {"greater-than-sign", '>'}, 162 {"question-mark", '?'}, 163 {"commercial-at", '@'}, 164 {"left-square-bracket", '['}, 165 {"backslash", '\\'}, 166 {"reverse-solidus", '\\'}, 167 {"right-square-bracket", ']'}, 168 {"circumflex", '^'}, 169 {"circumflex-accent", '^'}, 170 {"underscore", '_'}, 171 {"low-line", '_'}, 172 {"grave-accent", '`'}, 173 {"left-brace", '{'}, 174 {"left-curly-bracket", '{'}, 175 {"vertical-line", '|'}, 176 {"right-brace", '}'}, 177 {"right-curly-bracket", '}'}, 178 {"tilde", '~'}, 179 {"DEL", '\177'}, 180 {NULL, 0}}; 181 182 /* 183 * parse structure, passed up and down to avoid global variables and 184 * other clumsinesses 185 */ 186 struct parse { 187 const char *next; /* next character in RE */ 188 const char *end; /* end of string (-> NUL normally) */ 189 int error; /* has an error been seen? */ 190 sop *strip; /* malloced strip */ 191 sopno ssize; /* malloced strip size (allocated) */ 192 sopno slen; /* malloced strip length (used) */ 193 int ncsalloc; /* number of csets allocated */ 194 struct re_guts *g; 195 #define NPAREN 10 /* we need to remember () 1-9 for back refs */ 196 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 197 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 198 }; 199 200 static void p_ere(struct parse *, int); 201 static void p_ere_exp(struct parse *); 202 static void p_str(struct parse *); 203 static void p_bre(struct parse *, int, int); 204 static int p_simp_re(struct parse *, int); 205 static int p_count(struct parse *); 206 static void p_bracket(struct parse *); 207 static void p_b_term(struct parse *, cset *); 208 static void p_b_cclass(struct parse *, cset *); 209 static void p_b_eclass(struct parse *, cset *); 210 static char p_b_symbol(struct parse *); 211 static char p_b_coll_elem(struct parse *, int); 212 static char othercase(int); 213 static void bothcases(struct parse *, int); 214 static void ordinary(struct parse *, int); 215 static void nonnewline(struct parse *); 216 static void repeat(struct parse *, sopno, int, int); 217 static int seterr(struct parse *, int); 218 static cset *allocset(struct parse *); 219 static void freeset(struct parse *, cset *); 220 static int freezeset(struct parse *, cset *); 221 static int firstch(struct parse *, cset *); 222 static int nch(struct parse *, cset *); 223 static void mcadd(struct parse *, cset *, const char *); 224 static void mcinvert(struct parse *, cset *); 225 static void mccase(struct parse *, cset *); 226 static int isinsets(struct re_guts *, int); 227 static int samesets(struct re_guts *, int, int); 228 static void categorize(struct parse *, struct re_guts *); 229 static sopno dupl(struct parse *, sopno, sopno); 230 static void doemit(struct parse *, sop, size_t); 231 static void doinsert(struct parse *, sop, size_t, sopno); 232 static void dofwd(struct parse *, sopno, sop); 233 static void enlarge(struct parse *, sopno); 234 static void stripsnug(struct parse *, struct re_guts *); 235 static void findmust(struct parse *, struct re_guts *); 236 static sopno pluscount(struct parse *, struct re_guts *); 237 238 static char nuls[10]; /* place to point scanner in event of error */ 239 240 /* 241 * macros for use with parse structure 242 * BEWARE: these know that the parse structure is named `p' !!! 243 */ 244 #define PEEK() (*p->next) 245 #define PEEK2() (*(p->next + 1)) 246 #define MORE() (p->end - p->next > 0) 247 #define MORE2() (p->end - p->next > 1) 248 #define SEE(c) (MORE() && PEEK() == (c)) 249 #define SEETWO(a, b) (MORE2() && PEEK() == (a) && PEEK2() == (b)) 250 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 251 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 252 #define NEXT() (p->next++) 253 #define NEXT2() (p->next += 2) 254 #define NEXTn(n) (p->next += (n)) 255 #define GETNEXT() (*p->next++) 256 #define SETERROR(e) seterr(p, (e)) 257 #define REQUIRE(co, e) (void)((co) || SETERROR(e)) 258 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 259 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 260 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 261 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 262 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE() - (pos) + 1, pos) 263 #define AHEAD(pos) dofwd(p, pos, HERE() - (pos)) 264 #define ASTERN(sop, pos) EMIT(sop, HERE() - pos) 265 #define HERE() (p->slen) 266 #define THERE() (p->slen - 1) 267 #define THERETHERE() (p->slen - 2) 268 #define DROP(n) (p->slen -= (n)) 269 270 #ifdef _POSIX2_RE_DUP_MAX 271 #define DUPMAX _POSIX2_RE_DUP_MAX 272 #else 273 #define DUPMAX 255 274 #endif 275 #define REGINFINITY (DUPMAX + 1) 276 277 #ifndef NDEBUG 278 static int never = 0; /* for use in asserts; shuts lint up */ 279 #else 280 #define never 0 /* some <assert.h>s have bugs too */ 281 #endif 282 283 /* 284 - llvm_regcomp - interface for parser and compilation 285 */ 286 int /* 0 success, otherwise REG_something */ 287 llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags) { 288 struct parse pa; 289 struct re_guts *g; 290 struct parse *p = &pa; 291 int i; 292 size_t len; 293 #ifdef REDEBUG 294 #define GOODFLAGS(f) (f) 295 #else 296 #define GOODFLAGS(f) ((f) & ~REG_DUMP) 297 #endif 298 299 cflags = GOODFLAGS(cflags); 300 if ((cflags & REG_EXTENDED) && (cflags & REG_NOSPEC)) 301 return (REG_INVARG); 302 303 if (cflags & REG_PEND) { 304 if (preg->re_endp < pattern) 305 return (REG_INVARG); 306 len = preg->re_endp - pattern; 307 } else { 308 len = strlen((const char *)pattern); 309 } 310 311 /* do the mallocs early so failure handling is easy */ 312 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 313 (NC - 1) * sizeof(cat_t)); 314 if (g == NULL) 315 return (REG_ESPACE); 316 p->ssize = len / (size_t)2 * (size_t)3 + (size_t)1; /* ugh */ 317 p->strip = (sop *)calloc(p->ssize, sizeof(sop)); 318 p->slen = 0; 319 if (p->strip == NULL) { 320 free((char *)g); 321 return (REG_ESPACE); 322 } 323 324 /* set things up */ 325 p->g = g; 326 p->next = pattern; 327 p->end = p->next + len; 328 p->error = 0; 329 p->ncsalloc = 0; 330 for (i = 0; i < NPAREN; i++) { 331 p->pbegin[i] = 0; 332 p->pend[i] = 0; 333 } 334 g->csetsize = NC; 335 g->sets = NULL; 336 g->setbits = NULL; 337 g->ncsets = 0; 338 g->cflags = cflags; 339 g->iflags = 0; 340 g->nbol = 0; 341 g->neol = 0; 342 g->must = NULL; 343 g->mlen = 0; 344 g->nsub = 0; 345 g->ncategories = 1; /* category 0 is "everything else" */ 346 g->categories = &g->catspace[-(CHAR_MIN)]; 347 (void)memset((char *)g->catspace, 0, NC * sizeof(cat_t)); 348 g->backrefs = 0; 349 350 /* do it */ 351 EMIT(OEND, 0); 352 g->firststate = THERE(); 353 if (cflags & REG_EXTENDED) 354 p_ere(p, OUT); 355 else if (cflags & REG_NOSPEC) 356 p_str(p); 357 else 358 p_bre(p, OUT, OUT); 359 EMIT(OEND, 0); 360 g->laststate = THERE(); 361 362 /* tidy up loose ends and fill things in */ 363 categorize(p, g); 364 stripsnug(p, g); 365 findmust(p, g); 366 g->nplus = pluscount(p, g); 367 g->magic = MAGIC2; 368 preg->re_nsub = g->nsub; 369 preg->re_g = g; 370 preg->re_magic = MAGIC1; 371 #ifndef REDEBUG 372 /* not debugging, so can't rely on the assert() in llvm_regexec() */ 373 if (g->iflags & REGEX_BAD) 374 SETERROR(REG_ASSERT); 375 #endif 376 377 /* win or lose, we're done */ 378 if (p->error != 0) /* lose */ 379 llvm_regfree(preg); 380 return (p->error); 381 } 382 383 /* 384 - p_ere - ERE parser top level, concatenation and alternation 385 */ 386 static void p_ere(struct parse *p, 387 int stop) /* character this ERE should end at */ 388 { 389 char c; 390 sopno prevback = 0; 391 sopno prevfwd = 0; 392 sopno conc; 393 int first = 1; /* is this the first alternative? */ 394 395 for (;;) { 396 /* do a bunch of concatenated expressions */ 397 conc = HERE(); 398 while (MORE() && (c = PEEK()) != '|' && c != stop) 399 p_ere_exp(p); 400 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 401 402 if (!EAT('|')) 403 break; /* NOTE BREAK OUT */ 404 405 if (first) { 406 INSERT(OCH_, conc); /* offset is wrong */ 407 prevfwd = conc; 408 prevback = conc; 409 first = 0; 410 } 411 ASTERN(OOR1, prevback); 412 prevback = THERE(); 413 AHEAD(prevfwd); /* fix previous offset */ 414 prevfwd = HERE(); 415 EMIT(OOR2, 0); /* offset is very wrong */ 416 } 417 418 if (!first) { /* tail-end fixups */ 419 AHEAD(prevfwd); 420 ASTERN(O_CH, prevback); 421 } 422 423 assert(!MORE() || SEE(stop)); 424 } 425 426 /* 427 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 428 */ 429 static void p_ere_exp(struct parse *p) { 430 char c; 431 sopno pos; 432 int count; 433 int count2; 434 int backrefnum; 435 sopno subno; 436 int wascaret = 0; 437 438 assert(MORE()); /* caller should have ensured this */ 439 c = GETNEXT(); 440 441 pos = HERE(); 442 switch (c) { 443 case '(': 444 REQUIRE(MORE(), REG_EPAREN); 445 p->g->nsub++; 446 subno = p->g->nsub; 447 if (subno < NPAREN) 448 p->pbegin[subno] = HERE(); 449 EMIT(OLPAREN, subno); 450 if (!SEE(')')) 451 p_ere(p, ')'); 452 if (subno < NPAREN) { 453 p->pend[subno] = HERE(); 454 assert(p->pend[subno] != 0); 455 } 456 EMIT(ORPAREN, subno); 457 MUSTEAT(')', REG_EPAREN); 458 break; 459 #ifndef POSIX_MISTAKE 460 case ')': /* happens only if no current unmatched ( */ 461 /* 462 * You may ask, why the ifndef? Because I didn't notice 463 * this until slightly too late for 1003.2, and none of the 464 * other 1003.2 regular-expression reviewers noticed it at 465 * all. So an unmatched ) is legal POSIX, at least until 466 * we can get it fixed. 467 */ 468 SETERROR(REG_EPAREN); 469 break; 470 #endif 471 case '^': 472 EMIT(OBOL, 0); 473 p->g->iflags |= USEBOL; 474 p->g->nbol++; 475 wascaret = 1; 476 break; 477 case '$': 478 EMIT(OEOL, 0); 479 p->g->iflags |= USEEOL; 480 p->g->neol++; 481 break; 482 case '|': 483 SETERROR(REG_EMPTY); 484 break; 485 case '*': 486 case '+': 487 case '?': 488 SETERROR(REG_BADRPT); 489 break; 490 case '.': 491 if (p->g->cflags & REG_NEWLINE) 492 nonnewline(p); 493 else 494 EMIT(OANY, 0); 495 break; 496 case '[': 497 p_bracket(p); 498 break; 499 case '\\': 500 REQUIRE(MORE(), REG_EESCAPE); 501 c = GETNEXT(); 502 if (c >= '1' && c <= '9') { 503 /* \[0-9] is taken to be a back-reference to a previously specified 504 * matching group. backrefnum will hold the number. The matching 505 * group must exist (i.e. if \4 is found there must have been at 506 * least 4 matching groups specified in the pattern previously). 507 */ 508 backrefnum = c - '0'; 509 if (p->pend[backrefnum] == 0) { 510 SETERROR(REG_ESUBREG); 511 break; 512 } 513 514 /* Make sure everything checks out and emit the sequence 515 * that marks a back-reference to the parse structure. 516 */ 517 assert(backrefnum <= p->g->nsub); 518 EMIT(OBACK_, backrefnum); 519 assert(p->pbegin[backrefnum] != 0); 520 assert(OP(p->strip[p->pbegin[backrefnum]]) == OLPAREN); 521 assert(OP(p->strip[p->pend[backrefnum]]) == ORPAREN); 522 (void)dupl(p, p->pbegin[backrefnum] + 1, p->pend[backrefnum]); 523 EMIT(O_BACK, backrefnum); 524 p->g->backrefs = 1; 525 } else { 526 /* Other chars are simply themselves when escaped with a backslash. 527 */ 528 ordinary(p, c); 529 } 530 break; 531 case '{': /* okay as ordinary except if digit follows */ 532 REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 533 LLVM_FALLTHROUGH; 534 default: 535 ordinary(p, c); 536 break; 537 } 538 539 if (!MORE()) 540 return; 541 c = PEEK(); 542 /* we call { a repetition if followed by a digit */ 543 if (!(c == '*' || c == '+' || c == '?' || 544 (c == '{' && MORE2() && isdigit((uch)PEEK2())))) 545 return; /* no repetition, we're done */ 546 NEXT(); 547 548 REQUIRE(!wascaret, REG_BADRPT); 549 switch (c) { 550 case '*': /* implemented as +? */ 551 /* this case does not require the (y|) trick, noKLUDGE */ 552 INSERT(OPLUS_, pos); 553 ASTERN(O_PLUS, pos); 554 INSERT(OQUEST_, pos); 555 ASTERN(O_QUEST, pos); 556 break; 557 case '+': 558 INSERT(OPLUS_, pos); 559 ASTERN(O_PLUS, pos); 560 break; 561 case '?': 562 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 563 INSERT(OCH_, pos); /* offset slightly wrong */ 564 ASTERN(OOR1, pos); /* this one's right */ 565 AHEAD(pos); /* fix the OCH_ */ 566 EMIT(OOR2, 0); /* offset very wrong... */ 567 AHEAD(THERE()); /* ...so fix it */ 568 ASTERN(O_CH, THERETHERE()); 569 break; 570 case '{': 571 count = p_count(p); 572 if (EAT(',')) { 573 if (isdigit((uch)PEEK())) { 574 count2 = p_count(p); 575 REQUIRE(count <= count2, REG_BADBR); 576 } else /* single number with comma */ 577 count2 = REGINFINITY; 578 } else /* just a single number */ 579 count2 = count; 580 repeat(p, pos, count, count2); 581 if (!EAT('}')) { /* error heuristics */ 582 while (MORE() && PEEK() != '}') 583 NEXT(); 584 REQUIRE(MORE(), REG_EBRACE); 585 SETERROR(REG_BADBR); 586 } 587 break; 588 } 589 590 if (!MORE()) 591 return; 592 c = PEEK(); 593 if (!(c == '*' || c == '+' || c == '?' || 594 (c == '{' && MORE2() && isdigit((uch)PEEK2())))) 595 return; 596 SETERROR(REG_BADRPT); 597 } 598 599 /* 600 - p_str - string (no metacharacters) "parser" 601 */ 602 static void p_str(struct parse *p) { 603 REQUIRE(MORE(), REG_EMPTY); 604 while (MORE()) 605 ordinary(p, GETNEXT()); 606 } 607 608 /* 609 - p_bre - BRE parser top level, anchoring and concatenation 610 * Giving end1 as OUT essentially eliminates the end1/end2 check. 611 * 612 * This implementation is a bit of a kludge, in that a trailing $ is first 613 * taken as an ordinary character and then revised to be an anchor. The 614 * only undesirable side effect is that '$' gets included as a character 615 * category in such cases. This is fairly harmless; not worth fixing. 616 * The amount of lookahead needed to avoid this kludge is excessive. 617 */ 618 static void p_bre(struct parse *p, int end1, /* first terminating character */ 619 int end2) /* second terminating character */ 620 { 621 sopno start = HERE(); 622 int first = 1; /* first subexpression? */ 623 int wasdollar = 0; 624 625 if (EAT('^')) { 626 EMIT(OBOL, 0); 627 p->g->iflags |= USEBOL; 628 p->g->nbol++; 629 } 630 while (MORE() && !SEETWO(end1, end2)) { 631 wasdollar = p_simp_re(p, first); 632 first = 0; 633 } 634 if (wasdollar) { /* oops, that was a trailing anchor */ 635 DROP(1); 636 EMIT(OEOL, 0); 637 p->g->iflags |= USEEOL; 638 p->g->neol++; 639 } 640 641 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 642 } 643 644 /* 645 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 646 */ 647 static int /* was the simple RE an unbackslashed $? */ 648 p_simp_re(struct parse *p, 649 int starordinary) /* is a leading * an ordinary character? */ 650 { 651 int c; 652 int count; 653 int count2; 654 sopno pos; 655 int i; 656 sopno subno; 657 #define BACKSL (1 << CHAR_BIT) 658 659 pos = HERE(); /* repetition op, if any, covers from here */ 660 661 assert(MORE()); /* caller should have ensured this */ 662 c = GETNEXT(); 663 if (c == '\\') { 664 REQUIRE(MORE(), REG_EESCAPE); 665 c = BACKSL | GETNEXT(); 666 } 667 switch (c) { 668 case '.': 669 if (p->g->cflags & REG_NEWLINE) 670 nonnewline(p); 671 else 672 EMIT(OANY, 0); 673 break; 674 case '[': 675 p_bracket(p); 676 break; 677 case BACKSL | '{': 678 SETERROR(REG_BADRPT); 679 break; 680 case BACKSL | '(': 681 p->g->nsub++; 682 subno = p->g->nsub; 683 if (subno < NPAREN) 684 p->pbegin[subno] = HERE(); 685 EMIT(OLPAREN, subno); 686 /* the MORE here is an error heuristic */ 687 if (MORE() && !SEETWO('\\', ')')) 688 p_bre(p, '\\', ')'); 689 if (subno < NPAREN) { 690 p->pend[subno] = HERE(); 691 assert(p->pend[subno] != 0); 692 } 693 EMIT(ORPAREN, subno); 694 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 695 break; 696 case BACKSL | ')': /* should not get here -- must be user */ 697 case BACKSL | '}': 698 SETERROR(REG_EPAREN); 699 break; 700 case BACKSL | '1': 701 case BACKSL | '2': 702 case BACKSL | '3': 703 case BACKSL | '4': 704 case BACKSL | '5': 705 case BACKSL | '6': 706 case BACKSL | '7': 707 case BACKSL | '8': 708 case BACKSL | '9': 709 i = (c & ~BACKSL) - '0'; 710 assert(i < NPAREN); 711 if (p->pend[i] != 0) { 712 assert(i <= p->g->nsub); 713 EMIT(OBACK_, i); 714 assert(p->pbegin[i] != 0); 715 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 716 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 717 (void)dupl(p, p->pbegin[i] + 1, p->pend[i]); 718 EMIT(O_BACK, i); 719 } else { 720 SETERROR(REG_ESUBREG); 721 } 722 p->g->backrefs = 1; 723 break; 724 case '*': 725 REQUIRE(starordinary, REG_BADRPT); 726 LLVM_FALLTHROUGH; 727 default: 728 ordinary(p, (char)c); 729 break; 730 } 731 732 if (EAT('*')) { /* implemented as +? */ 733 /* this case does not require the (y|) trick, noKLUDGE */ 734 INSERT(OPLUS_, pos); 735 ASTERN(O_PLUS, pos); 736 INSERT(OQUEST_, pos); 737 ASTERN(O_QUEST, pos); 738 } else if (EATTWO('\\', '{')) { 739 count = p_count(p); 740 if (EAT(',')) { 741 if (MORE() && isdigit((uch)PEEK())) { 742 count2 = p_count(p); 743 REQUIRE(count <= count2, REG_BADBR); 744 } else /* single number with comma */ 745 count2 = REGINFINITY; 746 } else /* just a single number */ 747 count2 = count; 748 repeat(p, pos, count, count2); 749 if (!EATTWO('\\', '}')) { /* error heuristics */ 750 while (MORE() && !SEETWO('\\', '}')) 751 NEXT(); 752 REQUIRE(MORE(), REG_EBRACE); 753 SETERROR(REG_BADBR); 754 } 755 } else if (c == '$') /* $ (but not \$) ends it */ 756 return (1); 757 758 return (0); 759 } 760 761 /* 762 - p_count - parse a repetition count 763 */ 764 static int /* the value */ 765 p_count(struct parse *p) { 766 int count = 0; 767 int ndigits = 0; 768 769 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 770 count = count * 10 + (GETNEXT() - '0'); 771 ndigits++; 772 } 773 774 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 775 return (count); 776 } 777 778 /* 779 - p_bracket - parse a bracketed character list 780 * 781 * Note a significant property of this code: if the allocset() did SETERROR, 782 * no set operations are done. 783 */ 784 static void p_bracket(struct parse *p) { 785 cset *cs; 786 int invert = 0; 787 788 /* Dept of Truly Sickening Special-Case Kludges */ 789 if (p->end - p->next > 5) { 790 if (strncmp(p->next, "[:<:]]", 6) == 0) { 791 EMIT(OBOW, 0); 792 NEXTn(6); 793 return; 794 } 795 if (strncmp(p->next, "[:>:]]", 6) == 0) { 796 EMIT(OEOW, 0); 797 NEXTn(6); 798 return; 799 } 800 } 801 802 if ((cs = allocset(p)) == NULL) { 803 /* allocset did set error status in p */ 804 return; 805 } 806 807 if (EAT('^')) 808 invert++; /* make note to invert set at end */ 809 if (EAT(']')) 810 CHadd(cs, ']'); 811 else if (EAT('-')) 812 CHadd(cs, '-'); 813 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 814 p_b_term(p, cs); 815 if (EAT('-')) 816 CHadd(cs, '-'); 817 MUSTEAT(']', REG_EBRACK); 818 819 if (p->error != 0) { /* don't mess things up further */ 820 freeset(p, cs); 821 return; 822 } 823 824 if (p->g->cflags & REG_ICASE) { 825 int i; 826 int ci; 827 828 for (i = p->g->csetsize - 1; i >= 0; i--) 829 if (CHIN(cs, i) && isalpha(i)) { 830 ci = othercase(i); 831 if (ci != i) 832 CHadd(cs, ci); 833 } 834 if (cs->multis != NULL) 835 mccase(p, cs); 836 } 837 if (invert) { 838 int i; 839 840 for (i = p->g->csetsize - 1; i >= 0; i--) 841 if (CHIN(cs, i)) 842 CHsub(cs, i); 843 else 844 CHadd(cs, i); 845 if (p->g->cflags & REG_NEWLINE) 846 CHsub(cs, '\n'); 847 if (cs->multis != NULL) 848 mcinvert(p, cs); 849 } 850 851 assert(cs->multis == NULL); /* xxx */ 852 853 if (nch(p, cs) == 1) { /* optimize singleton sets */ 854 ordinary(p, firstch(p, cs)); 855 freeset(p, cs); 856 } else { 857 EMIT(OANYOF, freezeset(p, cs)); 858 } 859 } 860 861 /* 862 - p_b_term - parse one term of a bracketed character list 863 */ 864 static void p_b_term(struct parse *p, cset *cs) { 865 char c; 866 char start, finish; 867 int i; 868 869 /* classify what we've got */ 870 switch ((MORE()) ? PEEK() : '\0') { 871 case '[': 872 c = (MORE2()) ? PEEK2() : '\0'; 873 break; 874 case '-': 875 SETERROR(REG_ERANGE); 876 return; /* NOTE RETURN */ 877 break; 878 default: 879 c = '\0'; 880 break; 881 } 882 883 switch (c) { 884 case ':': /* character class */ 885 NEXT2(); 886 REQUIRE(MORE(), REG_EBRACK); 887 c = PEEK(); 888 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 889 p_b_cclass(p, cs); 890 REQUIRE(MORE(), REG_EBRACK); 891 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 892 break; 893 case '=': /* equivalence class */ 894 NEXT2(); 895 REQUIRE(MORE(), REG_EBRACK); 896 c = PEEK(); 897 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 898 p_b_eclass(p, cs); 899 REQUIRE(MORE(), REG_EBRACK); 900 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 901 break; 902 default: /* symbol, ordinary character, or range */ 903 /* xxx revision needed for multichar stuff */ 904 start = p_b_symbol(p); 905 if (SEE('-') && MORE2() && PEEK2() != ']') { 906 /* range */ 907 NEXT(); 908 if (EAT('-')) 909 finish = '-'; 910 else 911 finish = p_b_symbol(p); 912 } else { 913 finish = start; 914 } 915 /* xxx what about signed chars here... */ 916 REQUIRE(start <= finish, REG_ERANGE); 917 for (i = start; i <= finish; i++) 918 CHadd(cs, i); 919 break; 920 } 921 } 922 923 /* 924 - p_b_cclass - parse a character-class name and deal with it 925 */ 926 static void p_b_cclass(struct parse *p, cset *cs) { 927 const char *sp = p->next; 928 struct cclass *cp; 929 size_t len; 930 const char *u; 931 char c; 932 933 while (MORE() && isalpha((uch)PEEK())) 934 NEXT(); 935 len = p->next - sp; 936 for (cp = cclasses; cp->name != NULL; cp++) 937 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 938 break; 939 if (cp->name == NULL) { 940 /* oops, didn't find it */ 941 SETERROR(REG_ECTYPE); 942 return; 943 } 944 945 u = cp->chars; 946 while ((c = *u++) != '\0') 947 CHadd(cs, c); 948 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 949 MCadd(p, cs, u); 950 } 951 952 /* 953 - p_b_eclass - parse an equivalence-class name and deal with it 954 * 955 * This implementation is incomplete. xxx 956 */ 957 static void p_b_eclass(struct parse *p, cset *cs) { 958 char c; 959 960 c = p_b_coll_elem(p, '='); 961 CHadd(cs, c); 962 } 963 964 /* 965 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 966 */ 967 static char /* value of symbol */ 968 p_b_symbol(struct parse *p) { 969 char value; 970 971 REQUIRE(MORE(), REG_EBRACK); 972 if (!EATTWO('[', '.')) 973 return (GETNEXT()); 974 975 /* collating symbol */ 976 value = p_b_coll_elem(p, '.'); 977 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 978 return (value); 979 } 980 981 /* 982 - p_b_coll_elem - parse a collating-element name and look it up 983 */ 984 static char /* value of collating element */ 985 p_b_coll_elem(struct parse *p, int endc) /* name ended by endc,']' */ 986 { 987 const char *sp = p->next; 988 struct cname *cp; 989 size_t len; 990 991 while (MORE() && !SEETWO(endc, ']')) 992 NEXT(); 993 if (!MORE()) { 994 SETERROR(REG_EBRACK); 995 return (0); 996 } 997 len = p->next - sp; 998 for (cp = cnames; cp->name != NULL; cp++) 999 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) 1000 return (cp->code); /* known name */ 1001 if (len == 1) 1002 return (*sp); /* single character */ 1003 SETERROR(REG_ECOLLATE); /* neither */ 1004 return (0); 1005 } 1006 1007 /* 1008 - othercase - return the case counterpart of an alphabetic 1009 */ 1010 static char /* if no counterpart, return ch */ 1011 othercase(int ch) { 1012 ch = (uch)ch; 1013 assert(isalpha(ch)); 1014 if (isupper(ch)) 1015 return ((uch)tolower(ch)); 1016 else if (islower(ch)) 1017 return ((uch)toupper(ch)); 1018 else /* peculiar, but could happen */ 1019 return (ch); 1020 } 1021 1022 /* 1023 - bothcases - emit a dualcase version of a two-case character 1024 * 1025 * Boy, is this implementation ever a kludge... 1026 */ 1027 static void bothcases(struct parse *p, int ch) { 1028 const char *oldnext = p->next; 1029 const char *oldend = p->end; 1030 char bracket[3]; 1031 1032 ch = (uch)ch; 1033 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1034 p->next = bracket; 1035 p->end = bracket + 2; 1036 bracket[0] = ch; 1037 bracket[1] = ']'; 1038 bracket[2] = '\0'; 1039 p_bracket(p); 1040 assert(p->next == bracket + 2); 1041 p->next = oldnext; 1042 p->end = oldend; 1043 } 1044 1045 /* 1046 - ordinary - emit an ordinary character 1047 */ 1048 static void ordinary(struct parse *p, int ch) { 1049 cat_t *cap = p->g->categories; 1050 1051 if ((p->g->cflags & REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) 1052 bothcases(p, ch); 1053 else { 1054 EMIT(OCHAR, (uch)ch); 1055 if (cap[ch] == 0) 1056 cap[ch] = p->g->ncategories++; 1057 } 1058 } 1059 1060 /* 1061 - nonnewline - emit REG_NEWLINE version of OANY 1062 * 1063 * Boy, is this implementation ever a kludge... 1064 */ 1065 static void nonnewline(struct parse *p) { 1066 const char *oldnext = p->next; 1067 const char *oldend = p->end; 1068 static const char bracket[4] = {'^', '\n', ']', '\0'}; 1069 1070 p->next = bracket; 1071 p->end = bracket + 3; 1072 p_bracket(p); 1073 assert(p->next == bracket + 3); 1074 p->next = oldnext; 1075 p->end = oldend; 1076 } 1077 1078 /* 1079 - repeat - generate code for a bounded repetition, recursively if needed 1080 */ 1081 static void repeat(struct parse *p, 1082 sopno start, /* operand from here to end of strip */ 1083 int from, /* repeated from this number */ 1084 int to) /* to this number of times (maybe INFINITY) */ 1085 { 1086 sopno finish = HERE(); 1087 #define N 2 1088 #define INF 3 1089 #define REP(f, t) ((f) * 8 + (t)) 1090 #define MAP(n) (((n) <= 1) ? (n) : ((n) == REGINFINITY) ? INF : N) 1091 sopno copy; 1092 1093 if (p->error != 0) /* head off possible runaway recursion */ 1094 return; 1095 1096 assert(from <= to); 1097 1098 switch (REP(MAP(from), MAP(to))) { 1099 case REP(0, 0): /* must be user doing this */ 1100 DROP(finish - start); /* drop the operand */ 1101 break; 1102 case REP(0, 1): /* as x{1,1}? */ 1103 case REP(0, N): /* as x{1,n}? */ 1104 case REP(0, INF): /* as x{1,}? */ 1105 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1106 INSERT(OCH_, start); /* offset is wrong... */ 1107 repeat(p, start + 1, 1, to); 1108 ASTERN(OOR1, start); 1109 AHEAD(start); /* ... fix it */ 1110 EMIT(OOR2, 0); 1111 AHEAD(THERE()); 1112 ASTERN(O_CH, THERETHERE()); 1113 break; 1114 case REP(1, 1): /* trivial case */ 1115 /* done */ 1116 break; 1117 case REP(1, N): /* as x?x{1,n-1} */ 1118 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1119 INSERT(OCH_, start); 1120 ASTERN(OOR1, start); 1121 AHEAD(start); 1122 EMIT(OOR2, 0); /* offset very wrong... */ 1123 AHEAD(THERE()); /* ...so fix it */ 1124 ASTERN(O_CH, THERETHERE()); 1125 copy = dupl(p, start + 1, finish + 1); 1126 assert(copy == finish + 4); 1127 repeat(p, copy, 1, to - 1); 1128 break; 1129 case REP(1, INF): /* as x+ */ 1130 INSERT(OPLUS_, start); 1131 ASTERN(O_PLUS, start); 1132 break; 1133 case REP(N, N): /* as xx{m-1,n-1} */ 1134 copy = dupl(p, start, finish); 1135 repeat(p, copy, from - 1, to - 1); 1136 break; 1137 case REP(N, INF): /* as xx{n-1,INF} */ 1138 copy = dupl(p, start, finish); 1139 repeat(p, copy, from - 1, to); 1140 break; 1141 default: /* "can't happen" */ 1142 SETERROR(REG_ASSERT); /* just in case */ 1143 break; 1144 } 1145 } 1146 1147 /* 1148 - seterr - set an error condition 1149 */ 1150 static int /* useless but makes type checking happy */ 1151 seterr(struct parse *p, int e) { 1152 if (p->error == 0) /* keep earliest error condition */ 1153 p->error = e; 1154 p->next = nuls; /* try to bring things to a halt */ 1155 p->end = nuls; 1156 return (0); /* make the return value well-defined */ 1157 } 1158 1159 /* 1160 - allocset - allocate a set of characters for [] 1161 */ 1162 static cset *allocset(struct parse *p) { 1163 int no = p->g->ncsets++; 1164 size_t nc; 1165 size_t nbytes; 1166 cset *cs; 1167 size_t css = (size_t)p->g->csetsize; 1168 int i; 1169 1170 if (no >= p->ncsalloc) { /* need another column of space */ 1171 void *ptr; 1172 1173 p->ncsalloc += CHAR_BIT; 1174 nc = p->ncsalloc; 1175 if (nc > SIZE_MAX / sizeof(cset)) 1176 goto nomem; 1177 assert(nc % CHAR_BIT == 0); 1178 nbytes = nc / CHAR_BIT * css; 1179 1180 ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset)); 1181 if (ptr == NULL) 1182 goto nomem; 1183 p->g->sets = ptr; 1184 1185 ptr = (uch *)realloc((char *)p->g->setbits, nbytes); 1186 if (ptr == NULL) 1187 goto nomem; 1188 p->g->setbits = ptr; 1189 1190 for (i = 0; i < no; i++) 1191 p->g->sets[i].ptr = p->g->setbits + css * (i / CHAR_BIT); 1192 1193 (void)memset((char *)p->g->setbits + (nbytes - css), 0, css); 1194 } 1195 /* XXX should not happen */ 1196 if (p->g->sets == NULL || p->g->setbits == NULL) 1197 goto nomem; 1198 1199 cs = &p->g->sets[no]; 1200 cs->ptr = p->g->setbits + css * ((no) / CHAR_BIT); 1201 cs->mask = 1 << ((no) % CHAR_BIT); 1202 cs->hash = 0; 1203 cs->smultis = 0; 1204 cs->multis = NULL; 1205 1206 return (cs); 1207 nomem: 1208 free(p->g->sets); 1209 p->g->sets = NULL; 1210 free(p->g->setbits); 1211 p->g->setbits = NULL; 1212 1213 SETERROR(REG_ESPACE); 1214 /* caller's responsibility not to do set ops */ 1215 return (NULL); 1216 } 1217 1218 /* 1219 - freeset - free a now-unused set 1220 */ 1221 static void freeset(struct parse *p, cset *cs) { 1222 size_t i; 1223 cset *top = &p->g->sets[p->g->ncsets]; 1224 size_t css = (size_t)p->g->csetsize; 1225 1226 for (i = 0; i < css; i++) 1227 CHsub(cs, i); 1228 if (cs == top - 1) /* recover only the easy case */ 1229 p->g->ncsets--; 1230 } 1231 1232 /* 1233 - freezeset - final processing on a set of characters 1234 * 1235 * The main task here is merging identical sets. This is usually a waste 1236 * of time (although the hash code minimizes the overhead), but can win 1237 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1238 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1239 * the same value! 1240 */ 1241 static int /* set number */ 1242 freezeset(struct parse *p, cset *cs) { 1243 uch h = cs->hash; 1244 size_t i; 1245 cset *top = &p->g->sets[p->g->ncsets]; 1246 cset *cs2; 1247 size_t css = (size_t)p->g->csetsize; 1248 1249 /* look for an earlier one which is the same */ 1250 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1251 if (cs2->hash == h && cs2 != cs) { 1252 /* maybe */ 1253 for (i = 0; i < css; i++) 1254 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1255 break; /* no */ 1256 if (i == css) 1257 break; /* yes */ 1258 } 1259 1260 if (cs2 < top) { /* found one */ 1261 freeset(p, cs); 1262 cs = cs2; 1263 } 1264 1265 return ((int)(cs - p->g->sets)); 1266 } 1267 1268 /* 1269 - firstch - return first character in a set (which must have at least one) 1270 */ 1271 static int /* character; there is no "none" value */ 1272 firstch(struct parse *p, cset *cs) { 1273 size_t i; 1274 size_t css = (size_t)p->g->csetsize; 1275 1276 for (i = 0; i < css; i++) 1277 if (CHIN(cs, i)) 1278 return ((char)i); 1279 assert(never); 1280 return (0); /* arbitrary */ 1281 } 1282 1283 /* 1284 - nch - number of characters in a set 1285 */ 1286 static int nch(struct parse *p, cset *cs) { 1287 size_t i; 1288 size_t css = (size_t)p->g->csetsize; 1289 int n = 0; 1290 1291 for (i = 0; i < css; i++) 1292 if (CHIN(cs, i)) 1293 n++; 1294 return (n); 1295 } 1296 1297 /* 1298 - mcadd - add a collating element to a cset 1299 */ 1300 static void mcadd(struct parse *p, cset *cs, const char *cp) { 1301 size_t oldend = cs->smultis; 1302 void *np; 1303 1304 cs->smultis += strlen(cp) + 1; 1305 np = realloc(cs->multis, cs->smultis); 1306 if (np == NULL) { 1307 if (cs->multis) 1308 free(cs->multis); 1309 cs->multis = NULL; 1310 SETERROR(REG_ESPACE); 1311 return; 1312 } 1313 cs->multis = np; 1314 1315 llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1); 1316 } 1317 1318 /* 1319 - mcinvert - invert the list of collating elements in a cset 1320 * 1321 * This would have to know the set of possibilities. Implementation 1322 * is deferred. 1323 */ 1324 /* ARGSUSED */ 1325 static void mcinvert(struct parse *p, cset *cs) { 1326 assert(cs->multis == NULL); /* xxx */ 1327 } 1328 1329 /* 1330 - mccase - add case counterparts of the list of collating elements in a cset 1331 * 1332 * This would have to know the set of possibilities. Implementation 1333 * is deferred. 1334 */ 1335 /* ARGSUSED */ 1336 static void mccase(struct parse *p, cset *cs) { 1337 assert(cs->multis == NULL); /* xxx */ 1338 } 1339 1340 /* 1341 - isinsets - is this character in any sets? 1342 */ 1343 static int /* predicate */ 1344 isinsets(struct re_guts *g, int c) { 1345 uch *col; 1346 int i; 1347 int ncols = (g->ncsets + (CHAR_BIT - 1)) / CHAR_BIT; 1348 unsigned uc = (uch)c; 1349 1350 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1351 if (col[uc] != 0) 1352 return (1); 1353 return (0); 1354 } 1355 1356 /* 1357 - samesets - are these two characters in exactly the same sets? 1358 */ 1359 static int /* predicate */ 1360 samesets(struct re_guts *g, int c1, int c2) { 1361 uch *col; 1362 int i; 1363 int ncols = (g->ncsets + (CHAR_BIT - 1)) / CHAR_BIT; 1364 unsigned uc1 = (uch)c1; 1365 unsigned uc2 = (uch)c2; 1366 1367 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1368 if (col[uc1] != col[uc2]) 1369 return (0); 1370 return (1); 1371 } 1372 1373 /* 1374 - categorize - sort out character categories 1375 */ 1376 static void categorize(struct parse *p, struct re_guts *g) { 1377 cat_t *cats = g->categories; 1378 int c; 1379 int c2; 1380 cat_t cat; 1381 1382 /* avoid making error situations worse */ 1383 if (p->error != 0) 1384 return; 1385 1386 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1387 if (cats[c] == 0 && isinsets(g, c)) { 1388 cat = g->ncategories++; 1389 cats[c] = cat; 1390 for (c2 = c + 1; c2 <= CHAR_MAX; c2++) 1391 if (cats[c2] == 0 && samesets(g, c, c2)) 1392 cats[c2] = cat; 1393 } 1394 } 1395 1396 /* 1397 - dupl - emit a duplicate of a bunch of sops 1398 */ 1399 static sopno /* start of duplicate */ 1400 dupl(struct parse *p, sopno start, /* from here */ 1401 sopno finish) /* to this less one */ 1402 { 1403 sopno ret = HERE(); 1404 sopno len = finish - start; 1405 1406 assert(finish >= start); 1407 if (len == 0) 1408 return (ret); 1409 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1410 assert(p->ssize >= p->slen + len); 1411 (void)memmove((char *)(p->strip + p->slen), (char *)(p->strip + start), 1412 (size_t)len * sizeof(sop)); 1413 p->slen += len; 1414 return (ret); 1415 } 1416 1417 /* 1418 - doemit - emit a strip operator 1419 * 1420 * It might seem better to implement this as a macro with a function as 1421 * hard-case backup, but it's just too big and messy unless there are 1422 * some changes to the data structures. Maybe later. 1423 */ 1424 static void doemit(struct parse *p, sop op, size_t opnd) { 1425 /* avoid making error situations worse */ 1426 if (p->error != 0) 1427 return; 1428 1429 /* deal with oversize operands ("can't happen", more or less) */ 1430 assert(opnd < 1 << OPSHIFT); 1431 1432 /* deal with undersized strip */ 1433 if (p->slen >= p->ssize) 1434 enlarge(p, (p->ssize + 1) / 2 * 3); /* +50% */ 1435 assert(p->slen < p->ssize); 1436 1437 /* finally, it's all reduced to the easy case */ 1438 p->strip[p->slen++] = SOP(op, opnd); 1439 } 1440 1441 /* 1442 - doinsert - insert a sop into the strip 1443 */ 1444 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos) { 1445 sopno sn; 1446 sop s; 1447 int i; 1448 1449 /* avoid making error situations worse */ 1450 if (p->error != 0) 1451 return; 1452 1453 sn = HERE(); 1454 EMIT(op, opnd); /* do checks, ensure space */ 1455 assert(HERE() == sn + 1); 1456 s = p->strip[sn]; 1457 1458 /* adjust paren pointers */ 1459 assert(pos > 0); 1460 for (i = 1; i < NPAREN; i++) { 1461 if (p->pbegin[i] >= pos) { 1462 p->pbegin[i]++; 1463 } 1464 if (p->pend[i] >= pos) { 1465 p->pend[i]++; 1466 } 1467 } 1468 1469 memmove((char *)&p->strip[pos + 1], (char *)&p->strip[pos], 1470 (HERE() - pos - 1) * sizeof(sop)); 1471 p->strip[pos] = s; 1472 } 1473 1474 /* 1475 - dofwd - complete a forward reference 1476 */ 1477 static void dofwd(struct parse *p, sopno pos, sop value) { 1478 /* avoid making error situations worse */ 1479 if (p->error != 0) 1480 return; 1481 1482 assert(value < 1 << OPSHIFT); 1483 p->strip[pos] = OP(p->strip[pos]) | value; 1484 } 1485 1486 /* 1487 - enlarge - enlarge the strip 1488 */ 1489 static void enlarge(struct parse *p, sopno size) { 1490 sop *sp; 1491 1492 if (p->ssize >= size) 1493 return; 1494 1495 if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) { 1496 SETERROR(REG_ESPACE); 1497 return; 1498 } 1499 1500 sp = (sop *)realloc(p->strip, size * sizeof(sop)); 1501 if (sp == NULL) { 1502 SETERROR(REG_ESPACE); 1503 return; 1504 } 1505 p->strip = sp; 1506 p->ssize = size; 1507 } 1508 1509 /* 1510 - stripsnug - compact the strip 1511 */ 1512 static void stripsnug(struct parse *p, struct re_guts *g) { 1513 g->nstates = p->slen; 1514 if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) { 1515 g->strip = p->strip; 1516 SETERROR(REG_ESPACE); 1517 return; 1518 } 1519 1520 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1521 if (g->strip == NULL) { 1522 SETERROR(REG_ESPACE); 1523 g->strip = p->strip; 1524 } 1525 } 1526 1527 /* 1528 - findmust - fill in must and mlen with longest mandatory literal string 1529 * 1530 * This algorithm could do fancy things like analyzing the operands of | 1531 * for common subsequences. Someday. This code is simple and finds most 1532 * of the interesting cases. 1533 * 1534 * Note that must and mlen got initialized during setup. 1535 */ 1536 static void findmust(struct parse *p, struct re_guts *g) { 1537 sop *scan; 1538 sop *start = 0; /* start initialized in the default case, after that */ 1539 sop *newstart = 0; /* newstart was initialized in the OCHAR case */ 1540 sopno newlen; 1541 sop s; 1542 char *cp; 1543 sopno i; 1544 1545 /* avoid making error situations worse */ 1546 if (p->error != 0) 1547 return; 1548 1549 /* find the longest OCHAR sequence in strip */ 1550 newlen = 0; 1551 scan = g->strip + 1; 1552 do { 1553 s = *scan++; 1554 switch (OP(s)) { 1555 case OCHAR: /* sequence member */ 1556 if (newlen == 0) /* new sequence */ 1557 newstart = scan - 1; 1558 newlen++; 1559 break; 1560 case OPLUS_: /* things that don't break one */ 1561 case OLPAREN: 1562 case ORPAREN: 1563 break; 1564 case OQUEST_: /* things that must be skipped */ 1565 case OCH_: 1566 scan--; 1567 do { 1568 scan += OPND(s); 1569 s = *scan; 1570 /* assert() interferes w debug printouts */ 1571 if (OP(s) != O_QUEST && OP(s) != O_CH && OP(s) != OOR2) { 1572 g->iflags |= REGEX_BAD; 1573 return; 1574 } 1575 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1576 LLVM_FALLTHROUGH; 1577 default: /* things that break a sequence */ 1578 if (newlen > g->mlen) { /* ends one */ 1579 start = newstart; 1580 g->mlen = newlen; 1581 } 1582 newlen = 0; 1583 break; 1584 } 1585 } while (OP(s) != OEND); 1586 1587 if (g->mlen == 0) /* there isn't one */ 1588 return; 1589 1590 /* turn it into a character string */ 1591 g->must = malloc((size_t)g->mlen + 1); 1592 if (g->must == NULL) { /* argh; just forget it */ 1593 g->mlen = 0; 1594 return; 1595 } 1596 cp = g->must; 1597 scan = start; 1598 for (i = g->mlen; i > 0; i--) { 1599 while (OP(s = *scan++) != OCHAR) 1600 continue; 1601 assert(cp < g->must + g->mlen); 1602 *cp++ = (char)OPND(s); 1603 } 1604 assert(cp == g->must + g->mlen); 1605 *cp++ = '\0'; /* just on general principles */ 1606 } 1607 1608 /* 1609 - pluscount - count + nesting 1610 */ 1611 static sopno /* nesting depth */ 1612 pluscount(struct parse *p, struct re_guts *g) { 1613 sop *scan; 1614 sop s; 1615 sopno plusnest = 0; 1616 sopno maxnest = 0; 1617 1618 if (p->error != 0) 1619 return (0); /* there may not be an OEND */ 1620 1621 scan = g->strip + 1; 1622 do { 1623 s = *scan++; 1624 switch (OP(s)) { 1625 case OPLUS_: 1626 plusnest++; 1627 break; 1628 case O_PLUS: 1629 if (plusnest > maxnest) 1630 maxnest = plusnest; 1631 plusnest--; 1632 break; 1633 } 1634 } while (OP(s) != OEND); 1635 if (plusnest != 0) 1636 g->iflags |= REGEX_BAD; 1637 return (maxnest); 1638 } 1639