1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 38 * 39 * $FreeBSD$ 40 */ 41 42 #if defined(LIBC_SCCS) && !defined(lint) 43 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 44 #endif /* LIBC_SCCS and not lint */ 45 46 #include <sys/types.h> 47 #include <stdio.h> 48 #include <string.h> 49 #include <ctype.h> 50 #include <limits.h> 51 #include <stdlib.h> 52 #include <regex.h> 53 54 #include "collate.h" 55 56 #include "utils.h" 57 #include "regex2.h" 58 59 #include "cclass.h" 60 #include "cname.h" 61 62 /* 63 * parse structure, passed up and down to avoid global variables and 64 * other clumsinesses 65 */ 66 struct parse { 67 char *next; /* next character in RE */ 68 char *end; /* end of string (-> NUL normally) */ 69 int error; /* has an error been seen? */ 70 sop *strip; /* malloced strip */ 71 sopno ssize; /* malloced strip size (allocated) */ 72 sopno slen; /* malloced strip length (used) */ 73 int ncsalloc; /* number of csets allocated */ 74 struct re_guts *g; 75 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 76 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 77 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 78 }; 79 80 /* ========= begin header generated by ./mkh ========= */ 81 #ifdef __cplusplus 82 extern "C" { 83 #endif 84 85 /* === regcomp.c === */ 86 static void p_ere __P((struct parse *p, int stop)); 87 static void p_ere_exp __P((struct parse *p)); 88 static void p_str __P((struct parse *p)); 89 static void p_bre __P((struct parse *p, int end1, int end2)); 90 static int p_simp_re __P((struct parse *p, int starordinary)); 91 static int p_count __P((struct parse *p)); 92 static void p_bracket __P((struct parse *p)); 93 static void p_b_term __P((struct parse *p, cset *cs)); 94 static void p_b_cclass __P((struct parse *p, cset *cs)); 95 static void p_b_eclass __P((struct parse *p, cset *cs)); 96 static char p_b_symbol __P((struct parse *p)); 97 static char p_b_coll_elem __P((struct parse *p, int endc)); 98 static char othercase __P((int ch)); 99 static void bothcases __P((struct parse *p, int ch)); 100 static void ordinary __P((struct parse *p, int ch)); 101 static void nonnewline __P((struct parse *p)); 102 static void repeat __P((struct parse *p, sopno start, int from, int to)); 103 static int seterr __P((struct parse *p, int e)); 104 static cset *allocset __P((struct parse *p)); 105 static void freeset __P((struct parse *p, cset *cs)); 106 static int freezeset __P((struct parse *p, cset *cs)); 107 static int firstch __P((struct parse *p, cset *cs)); 108 static int nch __P((struct parse *p, cset *cs)); 109 static void mcadd __P((struct parse *p, cset *cs, char *cp)); 110 #if used 111 static void mcsub __P((cset *cs, char *cp)); 112 static int mcin __P((cset *cs, char *cp)); 113 static char *mcfind __P((cset *cs, char *cp)); 114 #endif 115 static void mcinvert __P((struct parse *p, cset *cs)); 116 static void mccase __P((struct parse *p, cset *cs)); 117 static int isinsets __P((struct re_guts *g, int c)); 118 static int samesets __P((struct re_guts *g, int c1, int c2)); 119 static void categorize __P((struct parse *p, struct re_guts *g)); 120 static sopno dupl __P((struct parse *p, sopno start, sopno finish)); 121 static void doemit __P((struct parse *p, sop op, size_t opnd)); 122 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos)); 123 static void dofwd __P((struct parse *p, sopno pos, sop value)); 124 static void enlarge __P((struct parse *p, sopno size)); 125 static void stripsnug __P((struct parse *p, struct re_guts *g)); 126 static void findmust __P((struct parse *p, struct re_guts *g)); 127 static int altoffset __P((sop *scan, int offset, int mccs)); 128 static void computejumps __P((struct parse *p, struct re_guts *g)); 129 static void computematchjumps __P((struct parse *p, struct re_guts *g)); 130 static sopno pluscount __P((struct parse *p, struct re_guts *g)); 131 132 #ifdef __cplusplus 133 } 134 #endif 135 /* ========= end header generated by ./mkh ========= */ 136 137 static char nuls[10]; /* place to point scanner in event of error */ 138 139 /* 140 * macros for use with parse structure 141 * BEWARE: these know that the parse structure is named `p' !!! 142 */ 143 #define PEEK() (*p->next) 144 #define PEEK2() (*(p->next+1)) 145 #define MORE() (p->next < p->end) 146 #define MORE2() (p->next+1 < p->end) 147 #define SEE(c) (MORE() && PEEK() == (c)) 148 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 149 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 150 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 151 #define NEXT() (p->next++) 152 #define NEXT2() (p->next += 2) 153 #define NEXTn(n) (p->next += (n)) 154 #define GETNEXT() (*p->next++) 155 #define SETERROR(e) seterr(p, (e)) 156 #define REQUIRE(co, e) ((co) || SETERROR(e)) 157 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 158 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 159 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 160 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 161 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 162 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 163 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 164 #define HERE() (p->slen) 165 #define THERE() (p->slen - 1) 166 #define THERETHERE() (p->slen - 2) 167 #define DROP(n) (p->slen -= (n)) 168 169 #ifndef NDEBUG 170 static int never = 0; /* for use in asserts; shuts lint up */ 171 #else 172 #define never 0 /* some <assert.h>s have bugs too */ 173 #endif 174 175 /* Macro used by computejump()/computematchjump() */ 176 #define MIN(a,b) ((a)<(b)?(a):(b)) 177 178 /* 179 - regcomp - interface for parser and compilation 180 = extern int regcomp(regex_t *, const char *, int); 181 = #define REG_BASIC 0000 182 = #define REG_EXTENDED 0001 183 = #define REG_ICASE 0002 184 = #define REG_NOSUB 0004 185 = #define REG_NEWLINE 0010 186 = #define REG_NOSPEC 0020 187 = #define REG_PEND 0040 188 = #define REG_DUMP 0200 189 */ 190 int /* 0 success, otherwise REG_something */ 191 regcomp(preg, pattern, cflags) 192 regex_t *preg; 193 const char *pattern; 194 int cflags; 195 { 196 struct parse pa; 197 register struct re_guts *g; 198 register struct parse *p = &pa; 199 register int i; 200 register size_t len; 201 #ifdef REDEBUG 202 # define GOODFLAGS(f) (f) 203 #else 204 # define GOODFLAGS(f) ((f)&~REG_DUMP) 205 #endif 206 207 cflags = GOODFLAGS(cflags); 208 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 209 return(REG_INVARG); 210 211 if (cflags®_PEND) { 212 if (preg->re_endp < pattern) 213 return(REG_INVARG); 214 len = preg->re_endp - pattern; 215 } else 216 len = strlen((char *)pattern); 217 218 /* do the mallocs early so failure handling is easy */ 219 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 220 (NC-1)*sizeof(cat_t)); 221 if (g == NULL) 222 return(REG_ESPACE); 223 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 224 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 225 p->slen = 0; 226 if (p->strip == NULL) { 227 free((char *)g); 228 return(REG_ESPACE); 229 } 230 231 /* set things up */ 232 p->g = g; 233 p->next = (char *)pattern; /* convenience; we do not modify it */ 234 p->end = p->next + len; 235 p->error = 0; 236 p->ncsalloc = 0; 237 for (i = 0; i < NPAREN; i++) { 238 p->pbegin[i] = 0; 239 p->pend[i] = 0; 240 } 241 g->csetsize = NC; 242 g->sets = NULL; 243 g->setbits = NULL; 244 g->ncsets = 0; 245 g->cflags = cflags; 246 g->iflags = 0; 247 g->nbol = 0; 248 g->neol = 0; 249 g->must = NULL; 250 g->moffset = -1; 251 g->charjump = NULL; 252 g->matchjump = NULL; 253 g->mlen = 0; 254 g->nsub = 0; 255 g->ncategories = 1; /* category 0 is "everything else" */ 256 g->categories = &g->catspace[-(CHAR_MIN)]; 257 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 258 g->backrefs = 0; 259 260 /* do it */ 261 EMIT(OEND, 0); 262 g->firststate = THERE(); 263 if (cflags®_EXTENDED) 264 p_ere(p, OUT); 265 else if (cflags®_NOSPEC) 266 p_str(p); 267 else 268 p_bre(p, OUT, OUT); 269 EMIT(OEND, 0); 270 g->laststate = THERE(); 271 272 /* tidy up loose ends and fill things in */ 273 categorize(p, g); 274 stripsnug(p, g); 275 findmust(p, g); 276 /* only use Boyer-Moore algorithm if the pattern is bigger 277 * than three characters 278 */ 279 if(g->mlen > 3) { 280 computejumps(p, g); 281 computematchjumps(p, g); 282 if(g->matchjump == NULL && g->charjump != NULL) { 283 free(g->charjump); 284 g->charjump = NULL; 285 } 286 } 287 g->nplus = pluscount(p, g); 288 g->magic = MAGIC2; 289 preg->re_nsub = g->nsub; 290 preg->re_g = g; 291 preg->re_magic = MAGIC1; 292 #ifndef REDEBUG 293 /* not debugging, so can't rely on the assert() in regexec() */ 294 if (g->iflags&BAD) 295 SETERROR(REG_ASSERT); 296 #endif 297 298 /* win or lose, we're done */ 299 if (p->error != 0) /* lose */ 300 regfree(preg); 301 return(p->error); 302 } 303 304 /* 305 - p_ere - ERE parser top level, concatenation and alternation 306 == static void p_ere(register struct parse *p, int stop); 307 */ 308 static void 309 p_ere(p, stop) 310 register struct parse *p; 311 int stop; /* character this ERE should end at */ 312 { 313 register char c; 314 register sopno prevback; 315 register sopno prevfwd; 316 register sopno conc; 317 register int first = 1; /* is this the first alternative? */ 318 319 for (;;) { 320 /* do a bunch of concatenated expressions */ 321 conc = HERE(); 322 while (MORE() && (c = PEEK()) != '|' && c != stop) 323 p_ere_exp(p); 324 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 325 326 if (!EAT('|')) 327 break; /* NOTE BREAK OUT */ 328 329 if (first) { 330 INSERT(OCH_, conc); /* offset is wrong */ 331 prevfwd = conc; 332 prevback = conc; 333 first = 0; 334 } 335 ASTERN(OOR1, prevback); 336 prevback = THERE(); 337 AHEAD(prevfwd); /* fix previous offset */ 338 prevfwd = HERE(); 339 EMIT(OOR2, 0); /* offset is very wrong */ 340 } 341 342 if (!first) { /* tail-end fixups */ 343 AHEAD(prevfwd); 344 ASTERN(O_CH, prevback); 345 } 346 347 assert(!MORE() || SEE(stop)); 348 } 349 350 /* 351 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 352 == static void p_ere_exp(register struct parse *p); 353 */ 354 static void 355 p_ere_exp(p) 356 register struct parse *p; 357 { 358 register char c; 359 register sopno pos; 360 register int count; 361 register int count2; 362 register sopno subno; 363 int wascaret = 0; 364 365 assert(MORE()); /* caller should have ensured this */ 366 c = GETNEXT(); 367 368 pos = HERE(); 369 switch (c) { 370 case '(': 371 (void)REQUIRE(MORE(), REG_EPAREN); 372 p->g->nsub++; 373 subno = p->g->nsub; 374 if (subno < NPAREN) 375 p->pbegin[subno] = HERE(); 376 EMIT(OLPAREN, subno); 377 if (!SEE(')')) 378 p_ere(p, ')'); 379 if (subno < NPAREN) { 380 p->pend[subno] = HERE(); 381 assert(p->pend[subno] != 0); 382 } 383 EMIT(ORPAREN, subno); 384 (void)MUSTEAT(')', REG_EPAREN); 385 break; 386 #ifndef POSIX_MISTAKE 387 case ')': /* happens only if no current unmatched ( */ 388 /* 389 * You may ask, why the ifndef? Because I didn't notice 390 * this until slightly too late for 1003.2, and none of the 391 * other 1003.2 regular-expression reviewers noticed it at 392 * all. So an unmatched ) is legal POSIX, at least until 393 * we can get it fixed. 394 */ 395 SETERROR(REG_EPAREN); 396 break; 397 #endif 398 case '^': 399 EMIT(OBOL, 0); 400 p->g->iflags |= USEBOL; 401 p->g->nbol++; 402 wascaret = 1; 403 break; 404 case '$': 405 EMIT(OEOL, 0); 406 p->g->iflags |= USEEOL; 407 p->g->neol++; 408 break; 409 case '|': 410 SETERROR(REG_EMPTY); 411 break; 412 case '*': 413 case '+': 414 case '?': 415 SETERROR(REG_BADRPT); 416 break; 417 case '.': 418 if (p->g->cflags®_NEWLINE) 419 nonnewline(p); 420 else 421 EMIT(OANY, 0); 422 break; 423 case '[': 424 p_bracket(p); 425 break; 426 case '\\': 427 (void)REQUIRE(MORE(), REG_EESCAPE); 428 c = GETNEXT(); 429 ordinary(p, c); 430 break; 431 case '{': /* okay as ordinary except if digit follows */ 432 (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 433 /* FALLTHROUGH */ 434 default: 435 ordinary(p, c); 436 break; 437 } 438 439 if (!MORE()) 440 return; 441 c = PEEK(); 442 /* we call { a repetition if followed by a digit */ 443 if (!( c == '*' || c == '+' || c == '?' || 444 (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) 445 return; /* no repetition, we're done */ 446 NEXT(); 447 448 (void)REQUIRE(!wascaret, REG_BADRPT); 449 switch (c) { 450 case '*': /* implemented as +? */ 451 /* this case does not require the (y|) trick, noKLUDGE */ 452 INSERT(OPLUS_, pos); 453 ASTERN(O_PLUS, pos); 454 INSERT(OQUEST_, pos); 455 ASTERN(O_QUEST, pos); 456 break; 457 case '+': 458 INSERT(OPLUS_, pos); 459 ASTERN(O_PLUS, pos); 460 break; 461 case '?': 462 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 463 INSERT(OCH_, pos); /* offset slightly wrong */ 464 ASTERN(OOR1, pos); /* this one's right */ 465 AHEAD(pos); /* fix the OCH_ */ 466 EMIT(OOR2, 0); /* offset very wrong... */ 467 AHEAD(THERE()); /* ...so fix it */ 468 ASTERN(O_CH, THERETHERE()); 469 break; 470 case '{': 471 count = p_count(p); 472 if (EAT(',')) { 473 if (isdigit((uch)PEEK())) { 474 count2 = p_count(p); 475 (void)REQUIRE(count <= count2, REG_BADBR); 476 } else /* single number with comma */ 477 count2 = INFINITY; 478 } else /* just a single number */ 479 count2 = count; 480 repeat(p, pos, count, count2); 481 if (!EAT('}')) { /* error heuristics */ 482 while (MORE() && PEEK() != '}') 483 NEXT(); 484 (void)REQUIRE(MORE(), REG_EBRACE); 485 SETERROR(REG_BADBR); 486 } 487 break; 488 } 489 490 if (!MORE()) 491 return; 492 c = PEEK(); 493 if (!( c == '*' || c == '+' || c == '?' || 494 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 495 return; 496 SETERROR(REG_BADRPT); 497 } 498 499 /* 500 - p_str - string (no metacharacters) "parser" 501 == static void p_str(register struct parse *p); 502 */ 503 static void 504 p_str(p) 505 register struct parse *p; 506 { 507 (void)REQUIRE(MORE(), REG_EMPTY); 508 while (MORE()) 509 ordinary(p, GETNEXT()); 510 } 511 512 /* 513 - p_bre - BRE parser top level, anchoring and concatenation 514 == static void p_bre(register struct parse *p, register int end1, \ 515 == register int end2); 516 * Giving end1 as OUT essentially eliminates the end1/end2 check. 517 * 518 * This implementation is a bit of a kludge, in that a trailing $ is first 519 * taken as an ordinary character and then revised to be an anchor. The 520 * only undesirable side effect is that '$' gets included as a character 521 * category in such cases. This is fairly harmless; not worth fixing. 522 * The amount of lookahead needed to avoid this kludge is excessive. 523 */ 524 static void 525 p_bre(p, end1, end2) 526 register struct parse *p; 527 register int end1; /* first terminating character */ 528 register int end2; /* second terminating character */ 529 { 530 register sopno start = HERE(); 531 register int first = 1; /* first subexpression? */ 532 register int wasdollar = 0; 533 534 if (EAT('^')) { 535 EMIT(OBOL, 0); 536 p->g->iflags |= USEBOL; 537 p->g->nbol++; 538 } 539 while (MORE() && !SEETWO(end1, end2)) { 540 wasdollar = p_simp_re(p, first); 541 first = 0; 542 } 543 if (wasdollar) { /* oops, that was a trailing anchor */ 544 DROP(1); 545 EMIT(OEOL, 0); 546 p->g->iflags |= USEEOL; 547 p->g->neol++; 548 } 549 550 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 551 } 552 553 /* 554 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 555 == static int p_simp_re(register struct parse *p, int starordinary); 556 */ 557 static int /* was the simple RE an unbackslashed $? */ 558 p_simp_re(p, starordinary) 559 register struct parse *p; 560 int starordinary; /* is a leading * an ordinary character? */ 561 { 562 register int c; 563 register int count; 564 register int count2; 565 register sopno pos; 566 register int i; 567 register sopno subno; 568 # define BACKSL (1<<CHAR_BIT) 569 570 pos = HERE(); /* repetion op, if any, covers from here */ 571 572 assert(MORE()); /* caller should have ensured this */ 573 c = GETNEXT(); 574 if (c == '\\') { 575 (void)REQUIRE(MORE(), REG_EESCAPE); 576 c = BACKSL | GETNEXT(); 577 } 578 switch (c) { 579 case '.': 580 if (p->g->cflags®_NEWLINE) 581 nonnewline(p); 582 else 583 EMIT(OANY, 0); 584 break; 585 case '[': 586 p_bracket(p); 587 break; 588 case BACKSL|'{': 589 SETERROR(REG_BADRPT); 590 break; 591 case BACKSL|'(': 592 p->g->nsub++; 593 subno = p->g->nsub; 594 if (subno < NPAREN) 595 p->pbegin[subno] = HERE(); 596 EMIT(OLPAREN, subno); 597 /* the MORE here is an error heuristic */ 598 if (MORE() && !SEETWO('\\', ')')) 599 p_bre(p, '\\', ')'); 600 if (subno < NPAREN) { 601 p->pend[subno] = HERE(); 602 assert(p->pend[subno] != 0); 603 } 604 EMIT(ORPAREN, subno); 605 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 606 break; 607 case BACKSL|')': /* should not get here -- must be user */ 608 case BACKSL|'}': 609 SETERROR(REG_EPAREN); 610 break; 611 case BACKSL|'1': 612 case BACKSL|'2': 613 case BACKSL|'3': 614 case BACKSL|'4': 615 case BACKSL|'5': 616 case BACKSL|'6': 617 case BACKSL|'7': 618 case BACKSL|'8': 619 case BACKSL|'9': 620 i = (c&~BACKSL) - '0'; 621 assert(i < NPAREN); 622 if (p->pend[i] != 0) { 623 assert(i <= p->g->nsub); 624 EMIT(OBACK_, i); 625 assert(p->pbegin[i] != 0); 626 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 627 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 628 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 629 EMIT(O_BACK, i); 630 } else 631 SETERROR(REG_ESUBREG); 632 p->g->backrefs = 1; 633 break; 634 case '*': 635 (void)REQUIRE(starordinary, REG_BADRPT); 636 /* FALLTHROUGH */ 637 default: 638 ordinary(p, (char)c); 639 break; 640 } 641 642 if (EAT('*')) { /* implemented as +? */ 643 /* this case does not require the (y|) trick, noKLUDGE */ 644 INSERT(OPLUS_, pos); 645 ASTERN(O_PLUS, pos); 646 INSERT(OQUEST_, pos); 647 ASTERN(O_QUEST, pos); 648 } else if (EATTWO('\\', '{')) { 649 count = p_count(p); 650 if (EAT(',')) { 651 if (MORE() && isdigit((uch)PEEK())) { 652 count2 = p_count(p); 653 (void)REQUIRE(count <= count2, REG_BADBR); 654 } else /* single number with comma */ 655 count2 = INFINITY; 656 } else /* just a single number */ 657 count2 = count; 658 repeat(p, pos, count, count2); 659 if (!EATTWO('\\', '}')) { /* error heuristics */ 660 while (MORE() && !SEETWO('\\', '}')) 661 NEXT(); 662 (void)REQUIRE(MORE(), REG_EBRACE); 663 SETERROR(REG_BADBR); 664 } 665 } else if (c == '$') /* $ (but not \$) ends it */ 666 return(1); 667 668 return(0); 669 } 670 671 /* 672 - p_count - parse a repetition count 673 == static int p_count(register struct parse *p); 674 */ 675 static int /* the value */ 676 p_count(p) 677 register struct parse *p; 678 { 679 register int count = 0; 680 register int ndigits = 0; 681 682 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 683 count = count*10 + (GETNEXT() - '0'); 684 ndigits++; 685 } 686 687 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 688 return(count); 689 } 690 691 /* 692 - p_bracket - parse a bracketed character list 693 == static void p_bracket(register struct parse *p); 694 * 695 * Note a significant property of this code: if the allocset() did SETERROR, 696 * no set operations are done. 697 */ 698 static void 699 p_bracket(p) 700 register struct parse *p; 701 { 702 register cset *cs = allocset(p); 703 register int invert = 0; 704 705 /* Dept of Truly Sickening Special-Case Kludges */ 706 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 707 EMIT(OBOW, 0); 708 NEXTn(6); 709 return; 710 } 711 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 712 EMIT(OEOW, 0); 713 NEXTn(6); 714 return; 715 } 716 717 if (EAT('^')) 718 invert++; /* make note to invert set at end */ 719 if (EAT(']')) 720 CHadd(cs, ']'); 721 else if (EAT('-')) 722 CHadd(cs, '-'); 723 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 724 p_b_term(p, cs); 725 if (EAT('-')) 726 CHadd(cs, '-'); 727 (void)MUSTEAT(']', REG_EBRACK); 728 729 if (p->error != 0) /* don't mess things up further */ 730 return; 731 732 if (p->g->cflags®_ICASE) { 733 register int i; 734 register int ci; 735 736 for (i = p->g->csetsize - 1; i >= 0; i--) 737 if (CHIN(cs, i) && isalpha(i)) { 738 ci = othercase(i); 739 if (ci != i) 740 CHadd(cs, ci); 741 } 742 if (cs->multis != NULL) 743 mccase(p, cs); 744 } 745 if (invert) { 746 register int i; 747 748 for (i = p->g->csetsize - 1; i >= 0; i--) 749 if (CHIN(cs, i)) 750 CHsub(cs, i); 751 else 752 CHadd(cs, i); 753 if (p->g->cflags®_NEWLINE) 754 CHsub(cs, '\n'); 755 if (cs->multis != NULL) 756 mcinvert(p, cs); 757 } 758 759 assert(cs->multis == NULL); /* xxx */ 760 761 if (nch(p, cs) == 1) { /* optimize singleton sets */ 762 ordinary(p, firstch(p, cs)); 763 freeset(p, cs); 764 } else 765 EMIT(OANYOF, freezeset(p, cs)); 766 } 767 768 /* 769 - p_b_term - parse one term of a bracketed character list 770 == static void p_b_term(register struct parse *p, register cset *cs); 771 */ 772 static void 773 p_b_term(p, cs) 774 register struct parse *p; 775 register cset *cs; 776 { 777 register char c; 778 register char start, finish; 779 register int i; 780 781 /* classify what we've got */ 782 switch ((MORE()) ? PEEK() : '\0') { 783 case '[': 784 c = (MORE2()) ? PEEK2() : '\0'; 785 break; 786 case '-': 787 SETERROR(REG_ERANGE); 788 return; /* NOTE RETURN */ 789 break; 790 default: 791 c = '\0'; 792 break; 793 } 794 795 switch (c) { 796 case ':': /* character class */ 797 NEXT2(); 798 (void)REQUIRE(MORE(), REG_EBRACK); 799 c = PEEK(); 800 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 801 p_b_cclass(p, cs); 802 (void)REQUIRE(MORE(), REG_EBRACK); 803 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 804 break; 805 case '=': /* equivalence class */ 806 NEXT2(); 807 (void)REQUIRE(MORE(), REG_EBRACK); 808 c = PEEK(); 809 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 810 p_b_eclass(p, cs); 811 (void)REQUIRE(MORE(), REG_EBRACK); 812 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 813 break; 814 default: /* symbol, ordinary character, or range */ 815 /* xxx revision needed for multichar stuff */ 816 start = p_b_symbol(p); 817 if (SEE('-') && MORE2() && PEEK2() != ']') { 818 /* range */ 819 NEXT(); 820 if (EAT('-')) 821 finish = '-'; 822 else 823 finish = p_b_symbol(p); 824 } else 825 finish = start; 826 if (start == finish) 827 CHadd(cs, start); 828 else { 829 if (__collate_load_error) { 830 (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); 831 for (i = (uch)start; i <= (uch)finish; i++) 832 CHadd(cs, i); 833 } else { 834 (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); 835 for (i = CHAR_MIN; i <= CHAR_MAX; i++) { 836 if ( __collate_range_cmp(start, i) <= 0 837 && __collate_range_cmp(i, finish) <= 0 838 ) 839 CHadd(cs, i); 840 } 841 } 842 } 843 break; 844 } 845 } 846 847 /* 848 - p_b_cclass - parse a character-class name and deal with it 849 == static void p_b_cclass(register struct parse *p, register cset *cs); 850 */ 851 static void 852 p_b_cclass(p, cs) 853 register struct parse *p; 854 register cset *cs; 855 { 856 register int c; 857 register char *sp = p->next; 858 register struct cclass *cp; 859 register size_t len; 860 861 while (MORE() && isalpha((uch)PEEK())) 862 NEXT(); 863 len = p->next - sp; 864 for (cp = cclasses; cp->name != NULL; cp++) 865 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 866 break; 867 if (cp->name == NULL) { 868 /* oops, didn't find it */ 869 SETERROR(REG_ECTYPE); 870 return; 871 } 872 873 switch (cp->fidx) { 874 case CALNUM: 875 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 876 if (isalnum((uch)c)) 877 CHadd(cs, c); 878 break; 879 case CALPHA: 880 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 881 if (isalpha((uch)c)) 882 CHadd(cs, c); 883 break; 884 case CBLANK: 885 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 886 if (isblank((uch)c)) 887 CHadd(cs, c); 888 break; 889 case CCNTRL: 890 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 891 if (iscntrl((uch)c)) 892 CHadd(cs, c); 893 break; 894 case CDIGIT: 895 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 896 if (isdigit((uch)c)) 897 CHadd(cs, c); 898 break; 899 case CGRAPH: 900 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 901 if (isgraph((uch)c)) 902 CHadd(cs, c); 903 break; 904 case CLOWER: 905 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 906 if (islower((uch)c)) 907 CHadd(cs, c); 908 break; 909 case CPRINT: 910 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 911 if (isprint((uch)c)) 912 CHadd(cs, c); 913 break; 914 case CPUNCT: 915 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 916 if (ispunct((uch)c)) 917 CHadd(cs, c); 918 break; 919 case CSPACE: 920 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 921 if (isspace((uch)c)) 922 CHadd(cs, c); 923 break; 924 case CUPPER: 925 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 926 if (isupper((uch)c)) 927 CHadd(cs, c); 928 break; 929 case CXDIGIT: 930 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 931 if (isxdigit((uch)c)) 932 CHadd(cs, c); 933 break; 934 } 935 #if 0 936 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 937 MCadd(p, cs, u); 938 #endif 939 } 940 941 /* 942 - p_b_eclass - parse an equivalence-class name and deal with it 943 == static void p_b_eclass(register struct parse *p, register cset *cs); 944 * 945 * This implementation is incomplete. xxx 946 */ 947 static void 948 p_b_eclass(p, cs) 949 register struct parse *p; 950 register cset *cs; 951 { 952 register char c; 953 954 c = p_b_coll_elem(p, '='); 955 CHadd(cs, c); 956 } 957 958 /* 959 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 960 == static char p_b_symbol(register struct parse *p); 961 */ 962 static char /* value of symbol */ 963 p_b_symbol(p) 964 register struct parse *p; 965 { 966 register char value; 967 968 (void)REQUIRE(MORE(), REG_EBRACK); 969 if (!EATTWO('[', '.')) 970 return(GETNEXT()); 971 972 /* collating symbol */ 973 value = p_b_coll_elem(p, '.'); 974 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 975 return(value); 976 } 977 978 /* 979 - p_b_coll_elem - parse a collating-element name and look it up 980 == static char p_b_coll_elem(register struct parse *p, int endc); 981 */ 982 static char /* value of collating element */ 983 p_b_coll_elem(p, endc) 984 register struct parse *p; 985 int endc; /* name ended by endc,']' */ 986 { 987 register char *sp = p->next; 988 register struct cname *cp; 989 register int len; 990 991 while (MORE() && !SEETWO(endc, ']')) 992 NEXT(); 993 if (!MORE()) { 994 SETERROR(REG_EBRACK); 995 return(0); 996 } 997 len = p->next - sp; 998 for (cp = cnames; cp->name != NULL; cp++) 999 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 1000 return(cp->code); /* known name */ 1001 if (len == 1) 1002 return(*sp); /* single character */ 1003 SETERROR(REG_ECOLLATE); /* neither */ 1004 return(0); 1005 } 1006 1007 /* 1008 - othercase - return the case counterpart of an alphabetic 1009 == static char othercase(int ch); 1010 */ 1011 static char /* if no counterpart, return ch */ 1012 othercase(ch) 1013 int ch; 1014 { 1015 ch = (uch)ch; 1016 assert(isalpha(ch)); 1017 if (isupper(ch)) 1018 return(tolower(ch)); 1019 else if (islower(ch)) 1020 return(toupper(ch)); 1021 else /* peculiar, but could happen */ 1022 return(ch); 1023 } 1024 1025 /* 1026 - bothcases - emit a dualcase version of a two-case character 1027 == static void bothcases(register struct parse *p, int ch); 1028 * 1029 * Boy, is this implementation ever a kludge... 1030 */ 1031 static void 1032 bothcases(p, ch) 1033 register struct parse *p; 1034 int ch; 1035 { 1036 register char *oldnext = p->next; 1037 register char *oldend = p->end; 1038 char bracket[3]; 1039 1040 ch = (uch)ch; 1041 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1042 p->next = bracket; 1043 p->end = bracket+2; 1044 bracket[0] = ch; 1045 bracket[1] = ']'; 1046 bracket[2] = '\0'; 1047 p_bracket(p); 1048 assert(p->next == bracket+2); 1049 p->next = oldnext; 1050 p->end = oldend; 1051 } 1052 1053 /* 1054 - ordinary - emit an ordinary character 1055 == static void ordinary(register struct parse *p, register int ch); 1056 */ 1057 static void 1058 ordinary(p, ch) 1059 register struct parse *p; 1060 register int ch; 1061 { 1062 register cat_t *cap = p->g->categories; 1063 1064 if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) 1065 bothcases(p, ch); 1066 else { 1067 EMIT(OCHAR, (uch)ch); 1068 if (cap[ch] == 0) 1069 cap[ch] = p->g->ncategories++; 1070 } 1071 } 1072 1073 /* 1074 - nonnewline - emit REG_NEWLINE version of OANY 1075 == static void nonnewline(register struct parse *p); 1076 * 1077 * Boy, is this implementation ever a kludge... 1078 */ 1079 static void 1080 nonnewline(p) 1081 register struct parse *p; 1082 { 1083 register char *oldnext = p->next; 1084 register char *oldend = p->end; 1085 char bracket[4]; 1086 1087 p->next = bracket; 1088 p->end = bracket+3; 1089 bracket[0] = '^'; 1090 bracket[1] = '\n'; 1091 bracket[2] = ']'; 1092 bracket[3] = '\0'; 1093 p_bracket(p); 1094 assert(p->next == bracket+3); 1095 p->next = oldnext; 1096 p->end = oldend; 1097 } 1098 1099 /* 1100 - repeat - generate code for a bounded repetition, recursively if needed 1101 == static void repeat(register struct parse *p, sopno start, int from, int to); 1102 */ 1103 static void 1104 repeat(p, start, from, to) 1105 register struct parse *p; 1106 sopno start; /* operand from here to end of strip */ 1107 int from; /* repeated from this number */ 1108 int to; /* to this number of times (maybe INFINITY) */ 1109 { 1110 register sopno finish = HERE(); 1111 # define N 2 1112 # define INF 3 1113 # define REP(f, t) ((f)*8 + (t)) 1114 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1115 register sopno copy; 1116 1117 if (p->error != 0) /* head off possible runaway recursion */ 1118 return; 1119 1120 assert(from <= to); 1121 1122 switch (REP(MAP(from), MAP(to))) { 1123 case REP(0, 0): /* must be user doing this */ 1124 DROP(finish-start); /* drop the operand */ 1125 break; 1126 case REP(0, 1): /* as x{1,1}? */ 1127 case REP(0, N): /* as x{1,n}? */ 1128 case REP(0, INF): /* as x{1,}? */ 1129 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1130 INSERT(OCH_, start); /* offset is wrong... */ 1131 repeat(p, start+1, 1, to); 1132 ASTERN(OOR1, start); 1133 AHEAD(start); /* ... fix it */ 1134 EMIT(OOR2, 0); 1135 AHEAD(THERE()); 1136 ASTERN(O_CH, THERETHERE()); 1137 break; 1138 case REP(1, 1): /* trivial case */ 1139 /* done */ 1140 break; 1141 case REP(1, N): /* as x?x{1,n-1} */ 1142 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1143 INSERT(OCH_, start); 1144 ASTERN(OOR1, start); 1145 AHEAD(start); 1146 EMIT(OOR2, 0); /* offset very wrong... */ 1147 AHEAD(THERE()); /* ...so fix it */ 1148 ASTERN(O_CH, THERETHERE()); 1149 copy = dupl(p, start+1, finish+1); 1150 assert(copy == finish+4); 1151 repeat(p, copy, 1, to-1); 1152 break; 1153 case REP(1, INF): /* as x+ */ 1154 INSERT(OPLUS_, start); 1155 ASTERN(O_PLUS, start); 1156 break; 1157 case REP(N, N): /* as xx{m-1,n-1} */ 1158 copy = dupl(p, start, finish); 1159 repeat(p, copy, from-1, to-1); 1160 break; 1161 case REP(N, INF): /* as xx{n-1,INF} */ 1162 copy = dupl(p, start, finish); 1163 repeat(p, copy, from-1, to); 1164 break; 1165 default: /* "can't happen" */ 1166 SETERROR(REG_ASSERT); /* just in case */ 1167 break; 1168 } 1169 } 1170 1171 /* 1172 - seterr - set an error condition 1173 == static int seterr(register struct parse *p, int e); 1174 */ 1175 static int /* useless but makes type checking happy */ 1176 seterr(p, e) 1177 register struct parse *p; 1178 int e; 1179 { 1180 if (p->error == 0) /* keep earliest error condition */ 1181 p->error = e; 1182 p->next = nuls; /* try to bring things to a halt */ 1183 p->end = nuls; 1184 return(0); /* make the return value well-defined */ 1185 } 1186 1187 /* 1188 - allocset - allocate a set of characters for [] 1189 == static cset *allocset(register struct parse *p); 1190 */ 1191 static cset * 1192 allocset(p) 1193 register struct parse *p; 1194 { 1195 register int no = p->g->ncsets++; 1196 register size_t nc; 1197 register size_t nbytes; 1198 register cset *cs; 1199 register size_t css = (size_t)p->g->csetsize; 1200 register int i; 1201 1202 if (no >= p->ncsalloc) { /* need another column of space */ 1203 p->ncsalloc += CHAR_BIT; 1204 nc = p->ncsalloc; 1205 assert(nc % CHAR_BIT == 0); 1206 nbytes = nc / CHAR_BIT * css; 1207 if (p->g->sets == NULL) 1208 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1209 else 1210 p->g->sets = (cset *)reallocf((char *)p->g->sets, 1211 nc * sizeof(cset)); 1212 if (p->g->setbits == NULL) 1213 p->g->setbits = (uch *)malloc(nbytes); 1214 else { 1215 p->g->setbits = (uch *)reallocf((char *)p->g->setbits, 1216 nbytes); 1217 /* xxx this isn't right if setbits is now NULL */ 1218 for (i = 0; i < no; i++) 1219 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1220 } 1221 if (p->g->sets != NULL && p->g->setbits != NULL) 1222 (void) memset((char *)p->g->setbits + (nbytes - css), 1223 0, css); 1224 else { 1225 no = 0; 1226 SETERROR(REG_ESPACE); 1227 /* caller's responsibility not to do set ops */ 1228 } 1229 } 1230 1231 assert(p->g->sets != NULL); /* xxx */ 1232 cs = &p->g->sets[no]; 1233 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1234 cs->mask = 1 << ((no) % CHAR_BIT); 1235 cs->hash = 0; 1236 cs->smultis = 0; 1237 cs->multis = NULL; 1238 1239 return(cs); 1240 } 1241 1242 /* 1243 - freeset - free a now-unused set 1244 == static void freeset(register struct parse *p, register cset *cs); 1245 */ 1246 static void 1247 freeset(p, cs) 1248 register struct parse *p; 1249 register cset *cs; 1250 { 1251 register int i; 1252 register cset *top = &p->g->sets[p->g->ncsets]; 1253 register size_t css = (size_t)p->g->csetsize; 1254 1255 for (i = 0; i < css; i++) 1256 CHsub(cs, i); 1257 if (cs == top-1) /* recover only the easy case */ 1258 p->g->ncsets--; 1259 } 1260 1261 /* 1262 - freezeset - final processing on a set of characters 1263 == static int freezeset(register struct parse *p, register cset *cs); 1264 * 1265 * The main task here is merging identical sets. This is usually a waste 1266 * of time (although the hash code minimizes the overhead), but can win 1267 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1268 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1269 * the same value! 1270 */ 1271 static int /* set number */ 1272 freezeset(p, cs) 1273 register struct parse *p; 1274 register cset *cs; 1275 { 1276 register short h = cs->hash; 1277 register int i; 1278 register cset *top = &p->g->sets[p->g->ncsets]; 1279 register cset *cs2; 1280 register size_t css = (size_t)p->g->csetsize; 1281 1282 /* look for an earlier one which is the same */ 1283 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1284 if (cs2->hash == h && cs2 != cs) { 1285 /* maybe */ 1286 for (i = 0; i < css; i++) 1287 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1288 break; /* no */ 1289 if (i == css) 1290 break; /* yes */ 1291 } 1292 1293 if (cs2 < top) { /* found one */ 1294 freeset(p, cs); 1295 cs = cs2; 1296 } 1297 1298 return((int)(cs - p->g->sets)); 1299 } 1300 1301 /* 1302 - firstch - return first character in a set (which must have at least one) 1303 == static int firstch(register struct parse *p, register cset *cs); 1304 */ 1305 static int /* character; there is no "none" value */ 1306 firstch(p, cs) 1307 register struct parse *p; 1308 register cset *cs; 1309 { 1310 register int i; 1311 register size_t css = (size_t)p->g->csetsize; 1312 1313 for (i = 0; i < css; i++) 1314 if (CHIN(cs, i)) 1315 return((char)i); 1316 assert(never); 1317 return(0); /* arbitrary */ 1318 } 1319 1320 /* 1321 - nch - number of characters in a set 1322 == static int nch(register struct parse *p, register cset *cs); 1323 */ 1324 static int 1325 nch(p, cs) 1326 register struct parse *p; 1327 register cset *cs; 1328 { 1329 register int i; 1330 register size_t css = (size_t)p->g->csetsize; 1331 register int n = 0; 1332 1333 for (i = 0; i < css; i++) 1334 if (CHIN(cs, i)) 1335 n++; 1336 return(n); 1337 } 1338 1339 /* 1340 - mcadd - add a collating element to a cset 1341 == static void mcadd(register struct parse *p, register cset *cs, \ 1342 == register char *cp); 1343 */ 1344 static void 1345 mcadd(p, cs, cp) 1346 register struct parse *p; 1347 register cset *cs; 1348 register char *cp; 1349 { 1350 register size_t oldend = cs->smultis; 1351 1352 cs->smultis += strlen(cp) + 1; 1353 if (cs->multis == NULL) 1354 cs->multis = malloc(cs->smultis); 1355 else 1356 cs->multis = reallocf(cs->multis, cs->smultis); 1357 if (cs->multis == NULL) { 1358 SETERROR(REG_ESPACE); 1359 return; 1360 } 1361 1362 (void) strcpy(cs->multis + oldend - 1, cp); 1363 cs->multis[cs->smultis - 1] = '\0'; 1364 } 1365 1366 #if used 1367 /* 1368 - mcsub - subtract a collating element from a cset 1369 == static void mcsub(register cset *cs, register char *cp); 1370 */ 1371 static void 1372 mcsub(cs, cp) 1373 register cset *cs; 1374 register char *cp; 1375 { 1376 register char *fp = mcfind(cs, cp); 1377 register size_t len = strlen(fp); 1378 1379 assert(fp != NULL); 1380 (void) memmove(fp, fp + len + 1, 1381 cs->smultis - (fp + len + 1 - cs->multis)); 1382 cs->smultis -= len; 1383 1384 if (cs->smultis == 0) { 1385 free(cs->multis); 1386 cs->multis = NULL; 1387 return; 1388 } 1389 1390 cs->multis = reallocf(cs->multis, cs->smultis); 1391 assert(cs->multis != NULL); 1392 } 1393 1394 /* 1395 - mcin - is a collating element in a cset? 1396 == static int mcin(register cset *cs, register char *cp); 1397 */ 1398 static int 1399 mcin(cs, cp) 1400 register cset *cs; 1401 register char *cp; 1402 { 1403 return(mcfind(cs, cp) != NULL); 1404 } 1405 1406 /* 1407 - mcfind - find a collating element in a cset 1408 == static char *mcfind(register cset *cs, register char *cp); 1409 */ 1410 static char * 1411 mcfind(cs, cp) 1412 register cset *cs; 1413 register char *cp; 1414 { 1415 register char *p; 1416 1417 if (cs->multis == NULL) 1418 return(NULL); 1419 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1420 if (strcmp(cp, p) == 0) 1421 return(p); 1422 return(NULL); 1423 } 1424 #endif 1425 1426 /* 1427 - mcinvert - invert the list of collating elements in a cset 1428 == static void mcinvert(register struct parse *p, register cset *cs); 1429 * 1430 * This would have to know the set of possibilities. Implementation 1431 * is deferred. 1432 */ 1433 static void 1434 mcinvert(p, cs) 1435 register struct parse *p; 1436 register cset *cs; 1437 { 1438 assert(cs->multis == NULL); /* xxx */ 1439 } 1440 1441 /* 1442 - mccase - add case counterparts of the list of collating elements in a cset 1443 == static void mccase(register struct parse *p, register cset *cs); 1444 * 1445 * This would have to know the set of possibilities. Implementation 1446 * is deferred. 1447 */ 1448 static void 1449 mccase(p, cs) 1450 register struct parse *p; 1451 register cset *cs; 1452 { 1453 assert(cs->multis == NULL); /* xxx */ 1454 } 1455 1456 /* 1457 - isinsets - is this character in any sets? 1458 == static int isinsets(register struct re_guts *g, int c); 1459 */ 1460 static int /* predicate */ 1461 isinsets(g, c) 1462 register struct re_guts *g; 1463 int c; 1464 { 1465 register uch *col; 1466 register int i; 1467 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1468 register unsigned uc = (uch)c; 1469 1470 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1471 if (col[uc] != 0) 1472 return(1); 1473 return(0); 1474 } 1475 1476 /* 1477 - samesets - are these two characters in exactly the same sets? 1478 == static int samesets(register struct re_guts *g, int c1, int c2); 1479 */ 1480 static int /* predicate */ 1481 samesets(g, c1, c2) 1482 register struct re_guts *g; 1483 int c1; 1484 int c2; 1485 { 1486 register uch *col; 1487 register int i; 1488 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1489 register unsigned uc1 = (uch)c1; 1490 register unsigned uc2 = (uch)c2; 1491 1492 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1493 if (col[uc1] != col[uc2]) 1494 return(0); 1495 return(1); 1496 } 1497 1498 /* 1499 - categorize - sort out character categories 1500 == static void categorize(struct parse *p, register struct re_guts *g); 1501 */ 1502 static void 1503 categorize(p, g) 1504 struct parse *p; 1505 register struct re_guts *g; 1506 { 1507 register cat_t *cats = g->categories; 1508 register int c; 1509 register int c2; 1510 register cat_t cat; 1511 1512 /* avoid making error situations worse */ 1513 if (p->error != 0) 1514 return; 1515 1516 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1517 if (cats[c] == 0 && isinsets(g, c)) { 1518 cat = g->ncategories++; 1519 cats[c] = cat; 1520 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1521 if (cats[c2] == 0 && samesets(g, c, c2)) 1522 cats[c2] = cat; 1523 } 1524 } 1525 1526 /* 1527 - dupl - emit a duplicate of a bunch of sops 1528 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1529 */ 1530 static sopno /* start of duplicate */ 1531 dupl(p, start, finish) 1532 register struct parse *p; 1533 sopno start; /* from here */ 1534 sopno finish; /* to this less one */ 1535 { 1536 register sopno ret = HERE(); 1537 register sopno len = finish - start; 1538 1539 assert(finish >= start); 1540 if (len == 0) 1541 return(ret); 1542 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1543 assert(p->ssize >= p->slen + len); 1544 (void) memcpy((char *)(p->strip + p->slen), 1545 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1546 p->slen += len; 1547 return(ret); 1548 } 1549 1550 /* 1551 - doemit - emit a strip operator 1552 == static void doemit(register struct parse *p, sop op, size_t opnd); 1553 * 1554 * It might seem better to implement this as a macro with a function as 1555 * hard-case backup, but it's just too big and messy unless there are 1556 * some changes to the data structures. Maybe later. 1557 */ 1558 static void 1559 doemit(p, op, opnd) 1560 register struct parse *p; 1561 sop op; 1562 size_t opnd; 1563 { 1564 /* avoid making error situations worse */ 1565 if (p->error != 0) 1566 return; 1567 1568 /* deal with oversize operands ("can't happen", more or less) */ 1569 assert(opnd < 1<<OPSHIFT); 1570 1571 /* deal with undersized strip */ 1572 if (p->slen >= p->ssize) 1573 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1574 assert(p->slen < p->ssize); 1575 1576 /* finally, it's all reduced to the easy case */ 1577 p->strip[p->slen++] = SOP(op, opnd); 1578 } 1579 1580 /* 1581 - doinsert - insert a sop into the strip 1582 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1583 */ 1584 static void 1585 doinsert(p, op, opnd, pos) 1586 register struct parse *p; 1587 sop op; 1588 size_t opnd; 1589 sopno pos; 1590 { 1591 register sopno sn; 1592 register sop s; 1593 register int i; 1594 1595 /* avoid making error situations worse */ 1596 if (p->error != 0) 1597 return; 1598 1599 sn = HERE(); 1600 EMIT(op, opnd); /* do checks, ensure space */ 1601 assert(HERE() == sn+1); 1602 s = p->strip[sn]; 1603 1604 /* adjust paren pointers */ 1605 assert(pos > 0); 1606 for (i = 1; i < NPAREN; i++) { 1607 if (p->pbegin[i] >= pos) { 1608 p->pbegin[i]++; 1609 } 1610 if (p->pend[i] >= pos) { 1611 p->pend[i]++; 1612 } 1613 } 1614 1615 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1616 (HERE()-pos-1)*sizeof(sop)); 1617 p->strip[pos] = s; 1618 } 1619 1620 /* 1621 - dofwd - complete a forward reference 1622 == static void dofwd(register struct parse *p, sopno pos, sop value); 1623 */ 1624 static void 1625 dofwd(p, pos, value) 1626 register struct parse *p; 1627 register sopno pos; 1628 sop value; 1629 { 1630 /* avoid making error situations worse */ 1631 if (p->error != 0) 1632 return; 1633 1634 assert(value < 1<<OPSHIFT); 1635 p->strip[pos] = OP(p->strip[pos]) | value; 1636 } 1637 1638 /* 1639 - enlarge - enlarge the strip 1640 == static void enlarge(register struct parse *p, sopno size); 1641 */ 1642 static void 1643 enlarge(p, size) 1644 register struct parse *p; 1645 register sopno size; 1646 { 1647 register sop *sp; 1648 1649 if (p->ssize >= size) 1650 return; 1651 1652 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1653 if (sp == NULL) { 1654 SETERROR(REG_ESPACE); 1655 return; 1656 } 1657 p->strip = sp; 1658 p->ssize = size; 1659 } 1660 1661 /* 1662 - stripsnug - compact the strip 1663 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1664 */ 1665 static void 1666 stripsnug(p, g) 1667 register struct parse *p; 1668 register struct re_guts *g; 1669 { 1670 g->nstates = p->slen; 1671 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1672 if (g->strip == NULL) { 1673 SETERROR(REG_ESPACE); 1674 g->strip = p->strip; 1675 } 1676 } 1677 1678 /* 1679 - findmust - fill in must and mlen with longest mandatory literal string 1680 == static void findmust(register struct parse *p, register struct re_guts *g); 1681 * 1682 * This algorithm could do fancy things like analyzing the operands of | 1683 * for common subsequences. Someday. This code is simple and finds most 1684 * of the interesting cases. 1685 * 1686 * Note that must and mlen got initialized during setup. 1687 */ 1688 static void 1689 findmust(p, g) 1690 struct parse *p; 1691 register struct re_guts *g; 1692 { 1693 register sop *scan; 1694 sop *start; 1695 register sop *newstart; 1696 register sopno newlen; 1697 register sop s; 1698 register char *cp; 1699 register sopno i; 1700 int offset; 1701 int cs, mccs; 1702 1703 /* avoid making error situations worse */ 1704 if (p->error != 0) 1705 return; 1706 1707 /* Find out if we can handle OANYOF or not */ 1708 mccs = 0; 1709 for (cs = 0; cs < g->ncsets; cs++) 1710 if (g->sets[cs].multis != NULL) 1711 mccs = 1; 1712 1713 /* find the longest OCHAR sequence in strip */ 1714 newlen = 0; 1715 offset = 0; 1716 g->moffset = 0; 1717 scan = g->strip + 1; 1718 do { 1719 s = *scan++; 1720 switch (OP(s)) { 1721 case OCHAR: /* sequence member */ 1722 if (newlen == 0) /* new sequence */ 1723 newstart = scan - 1; 1724 newlen++; 1725 break; 1726 case OPLUS_: /* things that don't break one */ 1727 case OLPAREN: 1728 case ORPAREN: 1729 break; 1730 case OQUEST_: /* things that must be skipped */ 1731 case OCH_: 1732 offset = altoffset(scan, offset, mccs); 1733 scan--; 1734 do { 1735 scan += OPND(s); 1736 s = *scan; 1737 /* assert() interferes w debug printouts */ 1738 if (OP(s) != O_QUEST && OP(s) != O_CH && 1739 OP(s) != OOR2) { 1740 g->iflags |= BAD; 1741 return; 1742 } 1743 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1744 /* fallthrough */ 1745 case OBOW: /* things that break a sequence */ 1746 case OEOW: 1747 case OBOL: 1748 case OEOL: 1749 case O_QUEST: 1750 case O_CH: 1751 case OEND: 1752 if (newlen > g->mlen) { /* ends one */ 1753 start = newstart; 1754 g->mlen = newlen; 1755 if (offset > -1) { 1756 g->moffset += offset; 1757 offset = newlen; 1758 } else 1759 g->moffset = offset; 1760 } else { 1761 if (offset > -1) 1762 offset += newlen; 1763 } 1764 newlen = 0; 1765 break; 1766 case OANY: 1767 if (newlen > g->mlen) { /* ends one */ 1768 start = newstart; 1769 g->mlen = newlen; 1770 if (offset > -1) { 1771 g->moffset += offset; 1772 offset = newlen; 1773 } else 1774 g->moffset = offset; 1775 } else { 1776 if (offset > -1) 1777 offset += newlen; 1778 } 1779 if (offset > -1) 1780 offset++; 1781 newlen = 0; 1782 break; 1783 case OANYOF: /* may or may not invalidate offset */ 1784 /* First, everything as OANY */ 1785 if (newlen > g->mlen) { /* ends one */ 1786 start = newstart; 1787 g->mlen = newlen; 1788 if (offset > -1) { 1789 g->moffset += offset; 1790 offset = newlen; 1791 } else 1792 g->moffset = offset; 1793 } else { 1794 if (offset > -1) 1795 offset += newlen; 1796 } 1797 if (offset > -1) 1798 offset++; 1799 newlen = 0; 1800 /* And, now, if we found out we can't deal with 1801 * it, make offset = -1. 1802 */ 1803 if (mccs) 1804 offset = -1; 1805 break; 1806 default: 1807 /* Anything here makes it impossible or too hard 1808 * to calculate the offset -- so we give up; 1809 * save the last known good offset, in case the 1810 * must sequence doesn't occur later. 1811 */ 1812 if (newlen > g->mlen) { /* ends one */ 1813 start = newstart; 1814 g->mlen = newlen; 1815 if (offset > -1) 1816 g->moffset += offset; 1817 else 1818 g->moffset = offset; 1819 } 1820 offset = -1; 1821 newlen = 0; 1822 break; 1823 } 1824 } while (OP(s) != OEND); 1825 1826 if (g->mlen == 0) { /* there isn't one */ 1827 g->moffset = -1; 1828 return; 1829 } 1830 1831 /* turn it into a character string */ 1832 g->must = malloc((size_t)g->mlen + 1); 1833 if (g->must == NULL) { /* argh; just forget it */ 1834 g->mlen = 0; 1835 g->moffset = -1; 1836 return; 1837 } 1838 cp = g->must; 1839 scan = start; 1840 for (i = g->mlen; i > 0; i--) { 1841 while (OP(s = *scan++) != OCHAR) 1842 continue; 1843 assert(cp < g->must + g->mlen); 1844 *cp++ = (char)OPND(s); 1845 } 1846 assert(cp == g->must + g->mlen); 1847 *cp++ = '\0'; /* just on general principles */ 1848 } 1849 1850 /* 1851 - altoffset - choose biggest offset among multiple choices 1852 == static int altoffset(sop *scan, int offset, int mccs); 1853 * 1854 * Compute, recursively if necessary, the largest offset among multiple 1855 * re paths. 1856 */ 1857 static int 1858 altoffset(scan, offset, mccs) 1859 sop *scan; 1860 int offset; 1861 int mccs; 1862 { 1863 int largest; 1864 int try; 1865 sop s; 1866 1867 /* If we gave up already on offsets, return */ 1868 if (offset == -1) 1869 return -1; 1870 1871 largest = 0; 1872 try = 0; 1873 s = *scan++; 1874 while (OP(s) != O_QUEST && OP(s) != O_CH) { 1875 switch (OP(s)) { 1876 case OOR1: 1877 if (try > largest) 1878 largest = try; 1879 try = 0; 1880 break; 1881 case OQUEST_: 1882 case OCH_: 1883 try = altoffset(scan, try, mccs); 1884 if (try == -1) 1885 return -1; 1886 scan--; 1887 do { 1888 scan += OPND(s); 1889 s = *scan; 1890 if (OP(s) != O_QUEST && OP(s) != O_CH && 1891 OP(s) != OOR2) 1892 return -1; 1893 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1894 /* We must skip to the next position, or we'll 1895 * leave altoffset() too early. 1896 */ 1897 scan++; 1898 break; 1899 case OANYOF: 1900 if (mccs) 1901 return -1; 1902 case OCHAR: 1903 case OANY: 1904 try++; 1905 case OBOW: 1906 case OEOW: 1907 case OLPAREN: 1908 case ORPAREN: 1909 case OOR2: 1910 break; 1911 default: 1912 try = -1; 1913 break; 1914 } 1915 if (try == -1) 1916 return -1; 1917 s = *scan++; 1918 } 1919 1920 if (try > largest) 1921 largest = try; 1922 1923 return largest+offset; 1924 } 1925 1926 /* 1927 - computejumps - compute char jumps for BM scan 1928 == static void computejumps(register struct parse *p, register struct re_guts *g); 1929 * 1930 * This algorithm assumes g->must exists and is has size greater than 1931 * zero. It's based on the algorithm found on Computer Algorithms by 1932 * Sara Baase. 1933 * 1934 * A char jump is the number of characters one needs to jump based on 1935 * the value of the character from the text that was mismatched. 1936 */ 1937 static void 1938 computejumps(p, g) 1939 struct parse *p; 1940 struct re_guts *g; 1941 { 1942 int ch; 1943 int mindex; 1944 1945 /* Avoid making errors worse */ 1946 if (p->error != 0) 1947 return; 1948 1949 g->charjump = (int*) malloc((NC + 1) * sizeof(int)); 1950 if (g->charjump == NULL) /* Not a fatal error */ 1951 return; 1952 /* Adjust for signed chars, if necessary */ 1953 g->charjump = &g->charjump[-(CHAR_MIN)]; 1954 1955 /* If the character does not exist in the pattern, the jump 1956 * is equal to the number of characters in the pattern. 1957 */ 1958 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 1959 g->charjump[ch] = g->mlen; 1960 1961 /* If the character does exist, compute the jump that would 1962 * take us to the last character in the pattern equal to it 1963 * (notice that we match right to left, so that last character 1964 * is the first one that would be matched). 1965 */ 1966 for (mindex = 0; mindex < g->mlen; mindex++) 1967 g->charjump[g->must[mindex]] = g->mlen - mindex - 1; 1968 } 1969 1970 /* 1971 - computematchjumps - compute match jumps for BM scan 1972 == static void computematchjumps(register struct parse *p, register struct re_guts *g); 1973 * 1974 * This algorithm assumes g->must exists and is has size greater than 1975 * zero. It's based on the algorithm found on Computer Algorithms by 1976 * Sara Baase. 1977 * 1978 * A match jump is the number of characters one needs to advance based 1979 * on the already-matched suffix. 1980 * Notice that all values here are minus (g->mlen-1), because of the way 1981 * the search algorithm works. 1982 */ 1983 static void 1984 computematchjumps(p, g) 1985 struct parse *p; 1986 struct re_guts *g; 1987 { 1988 int mindex; /* General "must" iterator */ 1989 int suffix; /* Keeps track of matching suffix */ 1990 int ssuffix; /* Keeps track of suffixes' suffix */ 1991 int* pmatches; /* pmatches[k] points to the next i 1992 * such that i+1...mlen is a substring 1993 * of k+1...k+mlen-i-1 1994 */ 1995 1996 /* Avoid making errors worse */ 1997 if (p->error != 0) 1998 return; 1999 2000 pmatches = (int*) malloc(g->mlen * sizeof(unsigned int)); 2001 if (pmatches == NULL) { 2002 g->matchjump = NULL; 2003 return; 2004 } 2005 2006 g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int)); 2007 if (g->matchjump == NULL) /* Not a fatal error */ 2008 return; 2009 2010 /* Set maximum possible jump for each character in the pattern */ 2011 for (mindex = 0; mindex < g->mlen; mindex++) 2012 g->matchjump[mindex] = 2*g->mlen - mindex - 1; 2013 2014 /* Compute pmatches[] */ 2015 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 2016 mindex--, suffix--) { 2017 pmatches[mindex] = suffix; 2018 2019 /* If a mismatch is found, interrupting the substring, 2020 * compute the matchjump for that position. If no 2021 * mismatch is found, then a text substring mismatched 2022 * against the suffix will also mismatch against the 2023 * substring. 2024 */ 2025 while (suffix < g->mlen 2026 && g->must[mindex] != g->must[suffix]) { 2027 g->matchjump[suffix] = MIN(g->matchjump[suffix], 2028 g->mlen - mindex - 1); 2029 suffix = pmatches[suffix]; 2030 } 2031 } 2032 2033 /* Compute the matchjump up to the last substring found to jump 2034 * to the beginning of the largest must pattern prefix matching 2035 * it's own suffix. 2036 */ 2037 for (mindex = 0; mindex <= suffix; mindex++) 2038 g->matchjump[mindex] = MIN(g->matchjump[mindex], 2039 g->mlen + suffix - mindex); 2040 2041 ssuffix = pmatches[suffix]; 2042 while (suffix < g->mlen) { 2043 while (suffix <= ssuffix && suffix < g->mlen) { 2044 g->matchjump[suffix] = MIN(g->matchjump[suffix], 2045 g->mlen + ssuffix - suffix); 2046 suffix++; 2047 } 2048 ssuffix = pmatches[ssuffix]; 2049 } 2050 2051 free(pmatches); 2052 } 2053 2054 /* 2055 - pluscount - count + nesting 2056 == static sopno pluscount(register struct parse *p, register struct re_guts *g); 2057 */ 2058 static sopno /* nesting depth */ 2059 pluscount(p, g) 2060 struct parse *p; 2061 register struct re_guts *g; 2062 { 2063 register sop *scan; 2064 register sop s; 2065 register sopno plusnest = 0; 2066 register sopno maxnest = 0; 2067 2068 if (p->error != 0) 2069 return(0); /* there may not be an OEND */ 2070 2071 scan = g->strip + 1; 2072 do { 2073 s = *scan++; 2074 switch (OP(s)) { 2075 case OPLUS_: 2076 plusnest++; 2077 break; 2078 case O_PLUS: 2079 if (plusnest > maxnest) 2080 maxnest = plusnest; 2081 plusnest--; 2082 break; 2083 } 2084 } while (OP(s) != OEND); 2085 if (plusnest != 0) 2086 g->iflags |= BAD; 2087 return(maxnest); 2088 } 2089