1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 38 */ 39 40 #if defined(LIBC_SCCS) && !defined(lint) 41 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 42 #endif /* LIBC_SCCS and not lint */ 43 44 #include <sys/types.h> 45 #include <stdio.h> 46 #include <string.h> 47 #include <ctype.h> 48 #include <limits.h> 49 #include <stdlib.h> 50 #include <regex.h> 51 52 #include "collate.h" 53 54 #include "utils.h" 55 #include "regex2.h" 56 57 #include "cclass.h" 58 #include "cname.h" 59 60 /* 61 * parse structure, passed up and down to avoid global variables and 62 * other clumsinesses 63 */ 64 struct parse { 65 char *next; /* next character in RE */ 66 char *end; /* end of string (-> NUL normally) */ 67 int error; /* has an error been seen? */ 68 sop *strip; /* malloced strip */ 69 sopno ssize; /* malloced strip size (allocated) */ 70 sopno slen; /* malloced strip length (used) */ 71 int ncsalloc; /* number of csets allocated */ 72 struct re_guts *g; 73 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 74 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 75 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 76 }; 77 78 /* ========= begin header generated by ./mkh ========= */ 79 #ifdef __cplusplus 80 extern "C" { 81 #endif 82 83 /* === regcomp.c === */ 84 static void p_ere __P((struct parse *p, int stop)); 85 static void p_ere_exp __P((struct parse *p)); 86 static void p_str __P((struct parse *p)); 87 static void p_bre __P((struct parse *p, int end1, int end2)); 88 static int p_simp_re __P((struct parse *p, int starordinary)); 89 static int p_count __P((struct parse *p)); 90 static void p_bracket __P((struct parse *p)); 91 static void p_b_term __P((struct parse *p, cset *cs)); 92 static void p_b_cclass __P((struct parse *p, cset *cs)); 93 static void p_b_eclass __P((struct parse *p, cset *cs)); 94 static char p_b_symbol __P((struct parse *p)); 95 static char p_b_coll_elem __P((struct parse *p, int endc)); 96 static char othercase __P((int ch)); 97 static void bothcases __P((struct parse *p, int ch)); 98 static void ordinary __P((struct parse *p, int ch)); 99 static void nonnewline __P((struct parse *p)); 100 static void repeat __P((struct parse *p, sopno start, int from, int to)); 101 static int seterr __P((struct parse *p, int e)); 102 static cset *allocset __P((struct parse *p)); 103 static void freeset __P((struct parse *p, cset *cs)); 104 static int freezeset __P((struct parse *p, cset *cs)); 105 static int firstch __P((struct parse *p, cset *cs)); 106 static int nch __P((struct parse *p, cset *cs)); 107 static void mcadd __P((struct parse *p, cset *cs, char *cp)); 108 #if used 109 static void mcsub __P((cset *cs, char *cp)); 110 static int mcin __P((cset *cs, char *cp)); 111 static char *mcfind __P((cset *cs, char *cp)); 112 #endif 113 static void mcinvert __P((struct parse *p, cset *cs)); 114 static void mccase __P((struct parse *p, cset *cs)); 115 static int isinsets __P((struct re_guts *g, int c)); 116 static int samesets __P((struct re_guts *g, int c1, int c2)); 117 static void categorize __P((struct parse *p, struct re_guts *g)); 118 static sopno dupl __P((struct parse *p, sopno start, sopno finish)); 119 static void doemit __P((struct parse *p, sop op, size_t opnd)); 120 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos)); 121 static void dofwd __P((struct parse *p, sopno pos, sop value)); 122 static void enlarge __P((struct parse *p, sopno size)); 123 static void stripsnug __P((struct parse *p, struct re_guts *g)); 124 static void findmust __P((struct parse *p, struct re_guts *g)); 125 static sopno pluscount __P((struct parse *p, struct re_guts *g)); 126 127 #ifdef __cplusplus 128 } 129 #endif 130 /* ========= end header generated by ./mkh ========= */ 131 132 static char nuls[10]; /* place to point scanner in event of error */ 133 134 /* 135 * macros for use with parse structure 136 * BEWARE: these know that the parse structure is named `p' !!! 137 */ 138 #define PEEK() (*p->next) 139 #define PEEK2() (*(p->next+1)) 140 #define MORE() (p->next < p->end) 141 #define MORE2() (p->next+1 < p->end) 142 #define SEE(c) (MORE() && PEEK() == (c)) 143 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 144 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 145 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 146 #define NEXT() (p->next++) 147 #define NEXT2() (p->next += 2) 148 #define NEXTn(n) (p->next += (n)) 149 #define GETNEXT() (*p->next++) 150 #define SETERROR(e) seterr(p, (e)) 151 #define REQUIRE(co, e) ((co) || SETERROR(e)) 152 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 153 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 154 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 155 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 156 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 157 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 158 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 159 #define HERE() (p->slen) 160 #define THERE() (p->slen - 1) 161 #define THERETHERE() (p->slen - 2) 162 #define DROP(n) (p->slen -= (n)) 163 164 #ifndef NDEBUG 165 static int never = 0; /* for use in asserts; shuts lint up */ 166 #else 167 #define never 0 /* some <assert.h>s have bugs too */ 168 #endif 169 170 /* 171 - regcomp - interface for parser and compilation 172 = extern int regcomp(regex_t *, const char *, int); 173 = #define REG_BASIC 0000 174 = #define REG_EXTENDED 0001 175 = #define REG_ICASE 0002 176 = #define REG_NOSUB 0004 177 = #define REG_NEWLINE 0010 178 = #define REG_NOSPEC 0020 179 = #define REG_PEND 0040 180 = #define REG_DUMP 0200 181 */ 182 int /* 0 success, otherwise REG_something */ 183 regcomp(preg, pattern, cflags) 184 regex_t *preg; 185 const char *pattern; 186 int cflags; 187 { 188 struct parse pa; 189 register struct re_guts *g; 190 register struct parse *p = &pa; 191 register int i; 192 register size_t len; 193 #ifdef REDEBUG 194 # define GOODFLAGS(f) (f) 195 #else 196 # define GOODFLAGS(f) ((f)&~REG_DUMP) 197 #endif 198 199 cflags = GOODFLAGS(cflags); 200 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 201 return(REG_INVARG); 202 203 if (cflags®_PEND) { 204 if (preg->re_endp < pattern) 205 return(REG_INVARG); 206 len = preg->re_endp - pattern; 207 } else 208 len = strlen((char *)pattern); 209 210 /* do the mallocs early so failure handling is easy */ 211 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 212 (NC-1)*sizeof(cat_t)); 213 if (g == NULL) 214 return(REG_ESPACE); 215 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 216 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 217 p->slen = 0; 218 if (p->strip == NULL) { 219 free((char *)g); 220 return(REG_ESPACE); 221 } 222 223 /* set things up */ 224 p->g = g; 225 p->next = (char *)pattern; /* convenience; we do not modify it */ 226 p->end = p->next + len; 227 p->error = 0; 228 p->ncsalloc = 0; 229 for (i = 0; i < NPAREN; i++) { 230 p->pbegin[i] = 0; 231 p->pend[i] = 0; 232 } 233 g->csetsize = NC; 234 g->sets = NULL; 235 g->setbits = NULL; 236 g->ncsets = 0; 237 g->cflags = cflags; 238 g->iflags = 0; 239 g->nbol = 0; 240 g->neol = 0; 241 g->must = NULL; 242 g->mlen = 0; 243 g->nsub = 0; 244 g->ncategories = 1; /* category 0 is "everything else" */ 245 g->categories = &g->catspace[-(CHAR_MIN)]; 246 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 247 g->backrefs = 0; 248 249 /* do it */ 250 EMIT(OEND, 0); 251 g->firststate = THERE(); 252 if (cflags®_EXTENDED) 253 p_ere(p, OUT); 254 else if (cflags®_NOSPEC) 255 p_str(p); 256 else 257 p_bre(p, OUT, OUT); 258 EMIT(OEND, 0); 259 g->laststate = THERE(); 260 261 /* tidy up loose ends and fill things in */ 262 categorize(p, g); 263 stripsnug(p, g); 264 findmust(p, g); 265 g->nplus = pluscount(p, g); 266 g->magic = MAGIC2; 267 preg->re_nsub = g->nsub; 268 preg->re_g = g; 269 preg->re_magic = MAGIC1; 270 #ifndef REDEBUG 271 /* not debugging, so can't rely on the assert() in regexec() */ 272 if (g->iflags&BAD) 273 SETERROR(REG_ASSERT); 274 #endif 275 276 /* win or lose, we're done */ 277 if (p->error != 0) /* lose */ 278 regfree(preg); 279 return(p->error); 280 } 281 282 /* 283 - p_ere - ERE parser top level, concatenation and alternation 284 == static void p_ere(register struct parse *p, int stop); 285 */ 286 static void 287 p_ere(p, stop) 288 register struct parse *p; 289 int stop; /* character this ERE should end at */ 290 { 291 register char c; 292 register sopno prevback; 293 register sopno prevfwd; 294 register sopno conc; 295 register int first = 1; /* is this the first alternative? */ 296 297 for (;;) { 298 /* do a bunch of concatenated expressions */ 299 conc = HERE(); 300 while (MORE() && (c = PEEK()) != '|' && c != stop) 301 p_ere_exp(p); 302 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 303 304 if (!EAT('|')) 305 break; /* NOTE BREAK OUT */ 306 307 if (first) { 308 INSERT(OCH_, conc); /* offset is wrong */ 309 prevfwd = conc; 310 prevback = conc; 311 first = 0; 312 } 313 ASTERN(OOR1, prevback); 314 prevback = THERE(); 315 AHEAD(prevfwd); /* fix previous offset */ 316 prevfwd = HERE(); 317 EMIT(OOR2, 0); /* offset is very wrong */ 318 } 319 320 if (!first) { /* tail-end fixups */ 321 AHEAD(prevfwd); 322 ASTERN(O_CH, prevback); 323 } 324 325 assert(!MORE() || SEE(stop)); 326 } 327 328 /* 329 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 330 == static void p_ere_exp(register struct parse *p); 331 */ 332 static void 333 p_ere_exp(p) 334 register struct parse *p; 335 { 336 register char c; 337 register sopno pos; 338 register int count; 339 register int count2; 340 register sopno subno; 341 int wascaret = 0; 342 343 assert(MORE()); /* caller should have ensured this */ 344 c = GETNEXT(); 345 346 pos = HERE(); 347 switch (c) { 348 case '(': 349 (void)REQUIRE(MORE(), REG_EPAREN); 350 p->g->nsub++; 351 subno = p->g->nsub; 352 if (subno < NPAREN) 353 p->pbegin[subno] = HERE(); 354 EMIT(OLPAREN, subno); 355 if (!SEE(')')) 356 p_ere(p, ')'); 357 if (subno < NPAREN) { 358 p->pend[subno] = HERE(); 359 assert(p->pend[subno] != 0); 360 } 361 EMIT(ORPAREN, subno); 362 (void)MUSTEAT(')', REG_EPAREN); 363 break; 364 #ifndef POSIX_MISTAKE 365 case ')': /* happens only if no current unmatched ( */ 366 /* 367 * You may ask, why the ifndef? Because I didn't notice 368 * this until slightly too late for 1003.2, and none of the 369 * other 1003.2 regular-expression reviewers noticed it at 370 * all. So an unmatched ) is legal POSIX, at least until 371 * we can get it fixed. 372 */ 373 SETERROR(REG_EPAREN); 374 break; 375 #endif 376 case '^': 377 EMIT(OBOL, 0); 378 p->g->iflags |= USEBOL; 379 p->g->nbol++; 380 wascaret = 1; 381 break; 382 case '$': 383 EMIT(OEOL, 0); 384 p->g->iflags |= USEEOL; 385 p->g->neol++; 386 break; 387 case '|': 388 SETERROR(REG_EMPTY); 389 break; 390 case '*': 391 case '+': 392 case '?': 393 SETERROR(REG_BADRPT); 394 break; 395 case '.': 396 if (p->g->cflags®_NEWLINE) 397 nonnewline(p); 398 else 399 EMIT(OANY, 0); 400 break; 401 case '[': 402 p_bracket(p); 403 break; 404 case '\\': 405 (void)REQUIRE(MORE(), REG_EESCAPE); 406 c = GETNEXT(); 407 ordinary(p, c); 408 break; 409 case '{': /* okay as ordinary except if digit follows */ 410 (void)REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); 411 /* FALLTHROUGH */ 412 default: 413 ordinary(p, c); 414 break; 415 } 416 417 if (!MORE()) 418 return; 419 c = PEEK(); 420 /* we call { a repetition if followed by a digit */ 421 if (!( c == '*' || c == '+' || c == '?' || 422 (c == '{' && MORE2() && isdigit(PEEK2())) )) 423 return; /* no repetition, we're done */ 424 NEXT(); 425 426 (void)REQUIRE(!wascaret, REG_BADRPT); 427 switch (c) { 428 case '*': /* implemented as +? */ 429 /* this case does not require the (y|) trick, noKLUDGE */ 430 INSERT(OPLUS_, pos); 431 ASTERN(O_PLUS, pos); 432 INSERT(OQUEST_, pos); 433 ASTERN(O_QUEST, pos); 434 break; 435 case '+': 436 INSERT(OPLUS_, pos); 437 ASTERN(O_PLUS, pos); 438 break; 439 case '?': 440 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 441 INSERT(OCH_, pos); /* offset slightly wrong */ 442 ASTERN(OOR1, pos); /* this one's right */ 443 AHEAD(pos); /* fix the OCH_ */ 444 EMIT(OOR2, 0); /* offset very wrong... */ 445 AHEAD(THERE()); /* ...so fix it */ 446 ASTERN(O_CH, THERETHERE()); 447 break; 448 case '{': 449 count = p_count(p); 450 if (EAT(',')) { 451 if (isdigit(PEEK())) { 452 count2 = p_count(p); 453 (void)REQUIRE(count <= count2, REG_BADBR); 454 } else /* single number with comma */ 455 count2 = INFINITY; 456 } else /* just a single number */ 457 count2 = count; 458 repeat(p, pos, count, count2); 459 if (!EAT('}')) { /* error heuristics */ 460 while (MORE() && PEEK() != '}') 461 NEXT(); 462 (void)REQUIRE(MORE(), REG_EBRACE); 463 SETERROR(REG_BADBR); 464 } 465 break; 466 } 467 468 if (!MORE()) 469 return; 470 c = PEEK(); 471 if (!( c == '*' || c == '+' || c == '?' || 472 (c == '{' && MORE2() && isdigit(PEEK2())) ) ) 473 return; 474 SETERROR(REG_BADRPT); 475 } 476 477 /* 478 - p_str - string (no metacharacters) "parser" 479 == static void p_str(register struct parse *p); 480 */ 481 static void 482 p_str(p) 483 register struct parse *p; 484 { 485 (void)REQUIRE(MORE(), REG_EMPTY); 486 while (MORE()) 487 ordinary(p, GETNEXT()); 488 } 489 490 /* 491 - p_bre - BRE parser top level, anchoring and concatenation 492 == static void p_bre(register struct parse *p, register int end1, \ 493 == register int end2); 494 * Giving end1 as OUT essentially eliminates the end1/end2 check. 495 * 496 * This implementation is a bit of a kludge, in that a trailing $ is first 497 * taken as an ordinary character and then revised to be an anchor. The 498 * only undesirable side effect is that '$' gets included as a character 499 * category in such cases. This is fairly harmless; not worth fixing. 500 * The amount of lookahead needed to avoid this kludge is excessive. 501 */ 502 static void 503 p_bre(p, end1, end2) 504 register struct parse *p; 505 register int end1; /* first terminating character */ 506 register int end2; /* second terminating character */ 507 { 508 register sopno start = HERE(); 509 register int first = 1; /* first subexpression? */ 510 register int wasdollar = 0; 511 512 if (EAT('^')) { 513 EMIT(OBOL, 0); 514 p->g->iflags |= USEBOL; 515 p->g->nbol++; 516 } 517 while (MORE() && !SEETWO(end1, end2)) { 518 wasdollar = p_simp_re(p, first); 519 first = 0; 520 } 521 if (wasdollar) { /* oops, that was a trailing anchor */ 522 DROP(1); 523 EMIT(OEOL, 0); 524 p->g->iflags |= USEEOL; 525 p->g->neol++; 526 } 527 528 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 529 } 530 531 /* 532 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 533 == static int p_simp_re(register struct parse *p, int starordinary); 534 */ 535 static int /* was the simple RE an unbackslashed $? */ 536 p_simp_re(p, starordinary) 537 register struct parse *p; 538 int starordinary; /* is a leading * an ordinary character? */ 539 { 540 register int c; 541 register int count; 542 register int count2; 543 register sopno pos; 544 register int i; 545 register sopno subno; 546 # define BACKSL (1<<CHAR_BIT) 547 548 pos = HERE(); /* repetion op, if any, covers from here */ 549 550 assert(MORE()); /* caller should have ensured this */ 551 c = GETNEXT(); 552 if (c == '\\') { 553 (void)REQUIRE(MORE(), REG_EESCAPE); 554 c = BACKSL | (unsigned char)GETNEXT(); 555 } 556 switch (c) { 557 case '.': 558 if (p->g->cflags®_NEWLINE) 559 nonnewline(p); 560 else 561 EMIT(OANY, 0); 562 break; 563 case '[': 564 p_bracket(p); 565 break; 566 case BACKSL|'{': 567 SETERROR(REG_BADRPT); 568 break; 569 case BACKSL|'(': 570 p->g->nsub++; 571 subno = p->g->nsub; 572 if (subno < NPAREN) 573 p->pbegin[subno] = HERE(); 574 EMIT(OLPAREN, subno); 575 /* the MORE here is an error heuristic */ 576 if (MORE() && !SEETWO('\\', ')')) 577 p_bre(p, '\\', ')'); 578 if (subno < NPAREN) { 579 p->pend[subno] = HERE(); 580 assert(p->pend[subno] != 0); 581 } 582 EMIT(ORPAREN, subno); 583 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 584 break; 585 case BACKSL|')': /* should not get here -- must be user */ 586 case BACKSL|'}': 587 SETERROR(REG_EPAREN); 588 break; 589 case BACKSL|'1': 590 case BACKSL|'2': 591 case BACKSL|'3': 592 case BACKSL|'4': 593 case BACKSL|'5': 594 case BACKSL|'6': 595 case BACKSL|'7': 596 case BACKSL|'8': 597 case BACKSL|'9': 598 i = (c&~BACKSL) - '0'; 599 assert(i < NPAREN); 600 if (p->pend[i] != 0) { 601 assert(i <= p->g->nsub); 602 EMIT(OBACK_, i); 603 assert(p->pbegin[i] != 0); 604 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 605 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 606 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 607 EMIT(O_BACK, i); 608 } else 609 SETERROR(REG_ESUBREG); 610 p->g->backrefs = 1; 611 break; 612 case '*': 613 (void)REQUIRE(starordinary, REG_BADRPT); 614 /* FALLTHROUGH */ 615 default: 616 ordinary(p, c &~ BACKSL); 617 break; 618 } 619 620 if (EAT('*')) { /* implemented as +? */ 621 /* this case does not require the (y|) trick, noKLUDGE */ 622 INSERT(OPLUS_, pos); 623 ASTERN(O_PLUS, pos); 624 INSERT(OQUEST_, pos); 625 ASTERN(O_QUEST, pos); 626 } else if (EATTWO('\\', '{')) { 627 count = p_count(p); 628 if (EAT(',')) { 629 if (MORE() && isdigit(PEEK())) { 630 count2 = p_count(p); 631 (void)REQUIRE(count <= count2, REG_BADBR); 632 } else /* single number with comma */ 633 count2 = INFINITY; 634 } else /* just a single number */ 635 count2 = count; 636 repeat(p, pos, count, count2); 637 if (!EATTWO('\\', '}')) { /* error heuristics */ 638 while (MORE() && !SEETWO('\\', '}')) 639 NEXT(); 640 (void)REQUIRE(MORE(), REG_EBRACE); 641 SETERROR(REG_BADBR); 642 } 643 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 644 return(1); 645 646 return(0); 647 } 648 649 /* 650 - p_count - parse a repetition count 651 == static int p_count(register struct parse *p); 652 */ 653 static int /* the value */ 654 p_count(p) 655 register struct parse *p; 656 { 657 register int count = 0; 658 register int ndigits = 0; 659 660 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) { 661 count = count*10 + (GETNEXT() - '0'); 662 ndigits++; 663 } 664 665 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 666 return(count); 667 } 668 669 /* 670 - p_bracket - parse a bracketed character list 671 == static void p_bracket(register struct parse *p); 672 * 673 * Note a significant property of this code: if the allocset() did SETERROR, 674 * no set operations are done. 675 */ 676 static void 677 p_bracket(p) 678 register struct parse *p; 679 { 680 register cset *cs = allocset(p); 681 register int invert = 0; 682 683 /* Dept of Truly Sickening Special-Case Kludges */ 684 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 685 EMIT(OBOW, 0); 686 NEXTn(6); 687 return; 688 } 689 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 690 EMIT(OEOW, 0); 691 NEXTn(6); 692 return; 693 } 694 695 if (EAT('^')) 696 invert++; /* make note to invert set at end */ 697 if (EAT(']')) 698 CHadd(cs, ']'); 699 else if (EAT('-')) 700 CHadd(cs, '-'); 701 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 702 p_b_term(p, cs); 703 if (EAT('-')) 704 CHadd(cs, '-'); 705 (void)MUSTEAT(']', REG_EBRACK); 706 707 if (p->error != 0) /* don't mess things up further */ 708 return; 709 710 if (p->g->cflags®_ICASE) { 711 register int i; 712 register int ci; 713 714 for (i = p->g->csetsize - 1; i >= 0; i--) 715 if (CHIN(cs, i) && isalpha(i)) { 716 ci = othercase(i); 717 if (ci != i) 718 CHadd(cs, ci); 719 } 720 if (cs->multis != NULL) 721 mccase(p, cs); 722 } 723 if (invert) { 724 register int i; 725 726 for (i = p->g->csetsize - 1; i >= 0; i--) 727 if (CHIN(cs, i)) 728 CHsub(cs, i); 729 else 730 CHadd(cs, i); 731 if (p->g->cflags®_NEWLINE) 732 CHsub(cs, '\n'); 733 if (cs->multis != NULL) 734 mcinvert(p, cs); 735 } 736 737 assert(cs->multis == NULL); /* xxx */ 738 739 if (nch(p, cs) == 1) { /* optimize singleton sets */ 740 ordinary(p, firstch(p, cs)); 741 freeset(p, cs); 742 } else 743 EMIT(OANYOF, freezeset(p, cs)); 744 } 745 746 /* 747 - p_b_term - parse one term of a bracketed character list 748 == static void p_b_term(register struct parse *p, register cset *cs); 749 */ 750 static void 751 p_b_term(p, cs) 752 register struct parse *p; 753 register cset *cs; 754 { 755 register char c; 756 register char start, finish; 757 register int i; 758 759 /* classify what we've got */ 760 switch ((MORE()) ? PEEK() : '\0') { 761 case '[': 762 c = (MORE2()) ? PEEK2() : '\0'; 763 break; 764 case '-': 765 SETERROR(REG_ERANGE); 766 return; /* NOTE RETURN */ 767 break; 768 default: 769 c = '\0'; 770 break; 771 } 772 773 switch (c) { 774 case ':': /* character class */ 775 NEXT2(); 776 (void)REQUIRE(MORE(), REG_EBRACK); 777 c = PEEK(); 778 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 779 p_b_cclass(p, cs); 780 (void)REQUIRE(MORE(), REG_EBRACK); 781 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 782 break; 783 case '=': /* equivalence class */ 784 NEXT2(); 785 (void)REQUIRE(MORE(), REG_EBRACK); 786 c = PEEK(); 787 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 788 p_b_eclass(p, cs); 789 (void)REQUIRE(MORE(), REG_EBRACK); 790 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 791 break; 792 default: /* symbol, ordinary character, or range */ 793 /* xxx revision needed for multichar stuff */ 794 start = p_b_symbol(p); 795 if (SEE('-') && MORE2() && PEEK2() != ']') { 796 /* range */ 797 NEXT(); 798 if (EAT('-')) 799 finish = '-'; 800 else 801 finish = p_b_symbol(p); 802 } else 803 finish = start; 804 if (start == finish) 805 CHadd(cs, start); 806 else { 807 if (__collate_load_error) { 808 (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); 809 for (i = (uch)start; i <= (uch)finish; i++) 810 CHadd(cs, i); 811 } else { 812 (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); 813 for (i = CHAR_MIN; i <= CHAR_MAX; i++) { 814 if ( __collate_range_cmp(start, i) <= 0 815 && __collate_range_cmp(i, finish) <= 0 816 ) 817 CHadd(cs, i); 818 } 819 } 820 } 821 break; 822 } 823 } 824 825 /* 826 - p_b_cclass - parse a character-class name and deal with it 827 == static void p_b_cclass(register struct parse *p, register cset *cs); 828 */ 829 static void 830 p_b_cclass(p, cs) 831 register struct parse *p; 832 register cset *cs; 833 { 834 register int c; 835 register char *sp = p->next; 836 register struct cclass *cp; 837 register size_t len; 838 839 while (MORE() && isalpha((uch)PEEK())) 840 NEXT(); 841 len = p->next - sp; 842 for (cp = cclasses; cp->name != NULL; cp++) 843 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 844 break; 845 if (cp->name == NULL) { 846 /* oops, didn't find it */ 847 SETERROR(REG_ECTYPE); 848 return; 849 } 850 851 switch (cp->fidx) { 852 case CALNUM: 853 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 854 if (isalnum((uch)c)) 855 CHadd(cs, c); 856 break; 857 case CALPHA: 858 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 859 if (isalpha((uch)c)) 860 CHadd(cs, c); 861 break; 862 case CBLANK: 863 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 864 if (isblank((uch)c)) 865 CHadd(cs, c); 866 break; 867 case CCNTRL: 868 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 869 if (iscntrl((uch)c)) 870 CHadd(cs, c); 871 break; 872 case CDIGIT: 873 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 874 if (isdigit((uch)c)) 875 CHadd(cs, c); 876 break; 877 case CGRAPH: 878 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 879 if (isgraph((uch)c)) 880 CHadd(cs, c); 881 break; 882 case CLOWER: 883 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 884 if (islower((uch)c)) 885 CHadd(cs, c); 886 break; 887 case CPRINT: 888 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 889 if (isprint((uch)c)) 890 CHadd(cs, c); 891 break; 892 case CPUNCT: 893 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 894 if (ispunct((uch)c)) 895 CHadd(cs, c); 896 break; 897 case CSPACE: 898 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 899 if (isspace((uch)c)) 900 CHadd(cs, c); 901 break; 902 case CUPPER: 903 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 904 if (isupper((uch)c)) 905 CHadd(cs, c); 906 break; 907 case CXDIGIT: 908 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 909 if (isxdigit((uch)c)) 910 CHadd(cs, c); 911 break; 912 } 913 #if 0 914 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 915 MCadd(p, cs, u); 916 #endif 917 } 918 919 /* 920 - p_b_eclass - parse an equivalence-class name and deal with it 921 == static void p_b_eclass(register struct parse *p, register cset *cs); 922 * 923 * This implementation is incomplete. xxx 924 */ 925 static void 926 p_b_eclass(p, cs) 927 register struct parse *p; 928 register cset *cs; 929 { 930 register char c; 931 932 c = p_b_coll_elem(p, '='); 933 CHadd(cs, c); 934 } 935 936 /* 937 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 938 == static char p_b_symbol(register struct parse *p); 939 */ 940 static char /* value of symbol */ 941 p_b_symbol(p) 942 register struct parse *p; 943 { 944 register char value; 945 946 (void)REQUIRE(MORE(), REG_EBRACK); 947 if (!EATTWO('[', '.')) 948 return(GETNEXT()); 949 950 /* collating symbol */ 951 value = p_b_coll_elem(p, '.'); 952 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 953 return(value); 954 } 955 956 /* 957 - p_b_coll_elem - parse a collating-element name and look it up 958 == static char p_b_coll_elem(register struct parse *p, int endc); 959 */ 960 static char /* value of collating element */ 961 p_b_coll_elem(p, endc) 962 register struct parse *p; 963 int endc; /* name ended by endc,']' */ 964 { 965 register char *sp = p->next; 966 register struct cname *cp; 967 register int len; 968 969 while (MORE() && !SEETWO(endc, ']')) 970 NEXT(); 971 if (!MORE()) { 972 SETERROR(REG_EBRACK); 973 return(0); 974 } 975 len = p->next - sp; 976 for (cp = cnames; cp->name != NULL; cp++) 977 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 978 return(cp->code); /* known name */ 979 if (len == 1) 980 return(*sp); /* single character */ 981 SETERROR(REG_ECOLLATE); /* neither */ 982 return(0); 983 } 984 985 /* 986 - othercase - return the case counterpart of an alphabetic 987 == static char othercase(int ch); 988 */ 989 static char /* if no counterpart, return ch */ 990 othercase(ch) 991 int ch; 992 { 993 ch = (unsigned char)ch; 994 assert(isalpha(ch)); 995 if (isupper(ch)) 996 return(tolower(ch)); 997 else if (islower(ch)) 998 return(toupper(ch)); 999 else /* peculiar, but could happen */ 1000 return(ch); 1001 } 1002 1003 /* 1004 - bothcases - emit a dualcase version of a two-case character 1005 == static void bothcases(register struct parse *p, int ch); 1006 * 1007 * Boy, is this implementation ever a kludge... 1008 */ 1009 static void 1010 bothcases(p, ch) 1011 register struct parse *p; 1012 int ch; 1013 { 1014 register char *oldnext = p->next; 1015 register char *oldend = p->end; 1016 char bracket[3]; 1017 1018 ch = (unsigned char)ch; 1019 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1020 p->next = bracket; 1021 p->end = bracket+2; 1022 bracket[0] = ch; 1023 bracket[1] = ']'; 1024 bracket[2] = '\0'; 1025 p_bracket(p); 1026 assert(p->next == bracket+2); 1027 p->next = oldnext; 1028 p->end = oldend; 1029 } 1030 1031 /* 1032 - ordinary - emit an ordinary character 1033 == static void ordinary(register struct parse *p, register int ch); 1034 */ 1035 static void 1036 ordinary(p, ch) 1037 register struct parse *p; 1038 register int ch; 1039 { 1040 register cat_t *cap = p->g->categories; 1041 1042 if ((p->g->cflags®_ICASE) && isalpha((unsigned char)ch) && othercase(ch) != ch) 1043 bothcases(p, ch); 1044 else { 1045 EMIT(OCHAR, (unsigned char)ch); 1046 if (cap[ch] == 0) 1047 cap[ch] = p->g->ncategories++; 1048 } 1049 } 1050 1051 /* 1052 - nonnewline - emit REG_NEWLINE version of OANY 1053 == static void nonnewline(register struct parse *p); 1054 * 1055 * Boy, is this implementation ever a kludge... 1056 */ 1057 static void 1058 nonnewline(p) 1059 register struct parse *p; 1060 { 1061 register char *oldnext = p->next; 1062 register char *oldend = p->end; 1063 char bracket[4]; 1064 1065 p->next = bracket; 1066 p->end = bracket+3; 1067 bracket[0] = '^'; 1068 bracket[1] = '\n'; 1069 bracket[2] = ']'; 1070 bracket[3] = '\0'; 1071 p_bracket(p); 1072 assert(p->next == bracket+3); 1073 p->next = oldnext; 1074 p->end = oldend; 1075 } 1076 1077 /* 1078 - repeat - generate code for a bounded repetition, recursively if needed 1079 == static void repeat(register struct parse *p, sopno start, int from, int to); 1080 */ 1081 static void 1082 repeat(p, start, from, to) 1083 register struct parse *p; 1084 sopno start; /* operand from here to end of strip */ 1085 int from; /* repeated from this number */ 1086 int to; /* to this number of times (maybe INFINITY) */ 1087 { 1088 register sopno finish = HERE(); 1089 # define N 2 1090 # define INF 3 1091 # define REP(f, t) ((f)*8 + (t)) 1092 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1093 register sopno copy; 1094 1095 if (p->error != 0) /* head off possible runaway recursion */ 1096 return; 1097 1098 assert(from <= to); 1099 1100 switch (REP(MAP(from), MAP(to))) { 1101 case REP(0, 0): /* must be user doing this */ 1102 DROP(finish-start); /* drop the operand */ 1103 break; 1104 case REP(0, 1): /* as x{1,1}? */ 1105 case REP(0, N): /* as x{1,n}? */ 1106 case REP(0, INF): /* as x{1,}? */ 1107 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1108 INSERT(OCH_, start); /* offset is wrong... */ 1109 repeat(p, start+1, 1, to); 1110 ASTERN(OOR1, start); 1111 AHEAD(start); /* ... fix it */ 1112 EMIT(OOR2, 0); 1113 AHEAD(THERE()); 1114 ASTERN(O_CH, THERETHERE()); 1115 break; 1116 case REP(1, 1): /* trivial case */ 1117 /* done */ 1118 break; 1119 case REP(1, N): /* as x?x{1,n-1} */ 1120 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1121 INSERT(OCH_, start); 1122 ASTERN(OOR1, start); 1123 AHEAD(start); 1124 EMIT(OOR2, 0); /* offset very wrong... */ 1125 AHEAD(THERE()); /* ...so fix it */ 1126 ASTERN(O_CH, THERETHERE()); 1127 copy = dupl(p, start+1, finish+1); 1128 assert(copy == finish+4); 1129 repeat(p, copy, 1, to-1); 1130 break; 1131 case REP(1, INF): /* as x+ */ 1132 INSERT(OPLUS_, start); 1133 ASTERN(O_PLUS, start); 1134 break; 1135 case REP(N, N): /* as xx{m-1,n-1} */ 1136 copy = dupl(p, start, finish); 1137 repeat(p, copy, from-1, to-1); 1138 break; 1139 case REP(N, INF): /* as xx{n-1,INF} */ 1140 copy = dupl(p, start, finish); 1141 repeat(p, copy, from-1, to); 1142 break; 1143 default: /* "can't happen" */ 1144 SETERROR(REG_ASSERT); /* just in case */ 1145 break; 1146 } 1147 } 1148 1149 /* 1150 - seterr - set an error condition 1151 == static int seterr(register struct parse *p, int e); 1152 */ 1153 static int /* useless but makes type checking happy */ 1154 seterr(p, e) 1155 register struct parse *p; 1156 int e; 1157 { 1158 if (p->error == 0) /* keep earliest error condition */ 1159 p->error = e; 1160 p->next = nuls; /* try to bring things to a halt */ 1161 p->end = nuls; 1162 return(0); /* make the return value well-defined */ 1163 } 1164 1165 /* 1166 - allocset - allocate a set of characters for [] 1167 == static cset *allocset(register struct parse *p); 1168 */ 1169 static cset * 1170 allocset(p) 1171 register struct parse *p; 1172 { 1173 register int no = p->g->ncsets++; 1174 register size_t nc; 1175 register size_t nbytes; 1176 register cset *cs; 1177 register size_t css = (size_t)p->g->csetsize; 1178 register int i; 1179 1180 if (no >= p->ncsalloc) { /* need another column of space */ 1181 p->ncsalloc += CHAR_BIT; 1182 nc = p->ncsalloc; 1183 assert(nc % CHAR_BIT == 0); 1184 nbytes = nc / CHAR_BIT * css; 1185 if (p->g->sets == NULL) 1186 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1187 else 1188 p->g->sets = (cset *)reallocf((char *)p->g->sets, 1189 nc * sizeof(cset)); 1190 if (p->g->setbits == NULL) 1191 p->g->setbits = (uch *)malloc(nbytes); 1192 else { 1193 p->g->setbits = (uch *)reallocf((char *)p->g->setbits, 1194 nbytes); 1195 /* xxx this isn't right if setbits is now NULL */ 1196 for (i = 0; i < no; i++) 1197 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1198 } 1199 if (p->g->sets != NULL && p->g->setbits != NULL) 1200 (void) memset((char *)p->g->setbits + (nbytes - css), 1201 0, css); 1202 else { 1203 no = 0; 1204 SETERROR(REG_ESPACE); 1205 /* caller's responsibility not to do set ops */ 1206 } 1207 } 1208 1209 assert(p->g->sets != NULL); /* xxx */ 1210 cs = &p->g->sets[no]; 1211 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1212 cs->mask = 1 << ((no) % CHAR_BIT); 1213 cs->hash = 0; 1214 cs->smultis = 0; 1215 cs->multis = NULL; 1216 1217 return(cs); 1218 } 1219 1220 /* 1221 - freeset - free a now-unused set 1222 == static void freeset(register struct parse *p, register cset *cs); 1223 */ 1224 static void 1225 freeset(p, cs) 1226 register struct parse *p; 1227 register cset *cs; 1228 { 1229 register int i; 1230 register cset *top = &p->g->sets[p->g->ncsets]; 1231 register size_t css = (size_t)p->g->csetsize; 1232 1233 for (i = 0; i < css; i++) 1234 CHsub(cs, i); 1235 if (cs == top-1) /* recover only the easy case */ 1236 p->g->ncsets--; 1237 } 1238 1239 /* 1240 - freezeset - final processing on a set of characters 1241 == static int freezeset(register struct parse *p, register cset *cs); 1242 * 1243 * The main task here is merging identical sets. This is usually a waste 1244 * of time (although the hash code minimizes the overhead), but can win 1245 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1246 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1247 * the same value! 1248 */ 1249 static int /* set number */ 1250 freezeset(p, cs) 1251 register struct parse *p; 1252 register cset *cs; 1253 { 1254 register short h = cs->hash; 1255 register int i; 1256 register cset *top = &p->g->sets[p->g->ncsets]; 1257 register cset *cs2; 1258 register size_t css = (size_t)p->g->csetsize; 1259 1260 /* look for an earlier one which is the same */ 1261 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1262 if (cs2->hash == h && cs2 != cs) { 1263 /* maybe */ 1264 for (i = 0; i < css; i++) 1265 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1266 break; /* no */ 1267 if (i == css) 1268 break; /* yes */ 1269 } 1270 1271 if (cs2 < top) { /* found one */ 1272 freeset(p, cs); 1273 cs = cs2; 1274 } 1275 1276 return((int)(cs - p->g->sets)); 1277 } 1278 1279 /* 1280 - firstch - return first character in a set (which must have at least one) 1281 == static int firstch(register struct parse *p, register cset *cs); 1282 */ 1283 static int /* character; there is no "none" value */ 1284 firstch(p, cs) 1285 register struct parse *p; 1286 register cset *cs; 1287 { 1288 register int i; 1289 register size_t css = (size_t)p->g->csetsize; 1290 1291 for (i = 0; i < css; i++) 1292 if (CHIN(cs, i)) 1293 return((unsigned char)i); 1294 assert(never); 1295 return(0); /* arbitrary */ 1296 } 1297 1298 /* 1299 - nch - number of characters in a set 1300 == static int nch(register struct parse *p, register cset *cs); 1301 */ 1302 static int 1303 nch(p, cs) 1304 register struct parse *p; 1305 register cset *cs; 1306 { 1307 register int i; 1308 register size_t css = (size_t)p->g->csetsize; 1309 register int n = 0; 1310 1311 for (i = 0; i < css; i++) 1312 if (CHIN(cs, i)) 1313 n++; 1314 return(n); 1315 } 1316 1317 /* 1318 - mcadd - add a collating element to a cset 1319 == static void mcadd(register struct parse *p, register cset *cs, \ 1320 == register char *cp); 1321 */ 1322 static void 1323 mcadd(p, cs, cp) 1324 register struct parse *p; 1325 register cset *cs; 1326 register char *cp; 1327 { 1328 register size_t oldend = cs->smultis; 1329 1330 cs->smultis += strlen(cp) + 1; 1331 if (cs->multis == NULL) 1332 cs->multis = malloc(cs->smultis); 1333 else 1334 cs->multis = reallocf(cs->multis, cs->smultis); 1335 if (cs->multis == NULL) { 1336 SETERROR(REG_ESPACE); 1337 return; 1338 } 1339 1340 (void) strcpy(cs->multis + oldend - 1, cp); 1341 cs->multis[cs->smultis - 1] = '\0'; 1342 } 1343 1344 #if used 1345 /* 1346 - mcsub - subtract a collating element from a cset 1347 == static void mcsub(register cset *cs, register char *cp); 1348 */ 1349 static void 1350 mcsub(cs, cp) 1351 register cset *cs; 1352 register char *cp; 1353 { 1354 register char *fp = mcfind(cs, cp); 1355 register size_t len = strlen(fp); 1356 1357 assert(fp != NULL); 1358 (void) memmove(fp, fp + len + 1, 1359 cs->smultis - (fp + len + 1 - cs->multis)); 1360 cs->smultis -= len; 1361 1362 if (cs->smultis == 0) { 1363 free(cs->multis); 1364 cs->multis = NULL; 1365 return; 1366 } 1367 1368 cs->multis = reallocf(cs->multis, cs->smultis); 1369 assert(cs->multis != NULL); 1370 } 1371 1372 /* 1373 - mcin - is a collating element in a cset? 1374 == static int mcin(register cset *cs, register char *cp); 1375 */ 1376 static int 1377 mcin(cs, cp) 1378 register cset *cs; 1379 register char *cp; 1380 { 1381 return(mcfind(cs, cp) != NULL); 1382 } 1383 1384 /* 1385 - mcfind - find a collating element in a cset 1386 == static char *mcfind(register cset *cs, register char *cp); 1387 */ 1388 static char * 1389 mcfind(cs, cp) 1390 register cset *cs; 1391 register char *cp; 1392 { 1393 register char *p; 1394 1395 if (cs->multis == NULL) 1396 return(NULL); 1397 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1398 if (strcmp(cp, p) == 0) 1399 return(p); 1400 return(NULL); 1401 } 1402 #endif 1403 1404 /* 1405 - mcinvert - invert the list of collating elements in a cset 1406 == static void mcinvert(register struct parse *p, register cset *cs); 1407 * 1408 * This would have to know the set of possibilities. Implementation 1409 * is deferred. 1410 */ 1411 static void 1412 mcinvert(p, cs) 1413 register struct parse *p; 1414 register cset *cs; 1415 { 1416 assert(cs->multis == NULL); /* xxx */ 1417 } 1418 1419 /* 1420 - mccase - add case counterparts of the list of collating elements in a cset 1421 == static void mccase(register struct parse *p, register cset *cs); 1422 * 1423 * This would have to know the set of possibilities. Implementation 1424 * is deferred. 1425 */ 1426 static void 1427 mccase(p, cs) 1428 register struct parse *p; 1429 register cset *cs; 1430 { 1431 assert(cs->multis == NULL); /* xxx */ 1432 } 1433 1434 /* 1435 - isinsets - is this character in any sets? 1436 == static int isinsets(register struct re_guts *g, int c); 1437 */ 1438 static int /* predicate */ 1439 isinsets(g, c) 1440 register struct re_guts *g; 1441 int c; 1442 { 1443 register uch *col; 1444 register int i; 1445 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1446 register unsigned uc = (unsigned char)c; 1447 1448 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1449 if (col[uc] != 0) 1450 return(1); 1451 return(0); 1452 } 1453 1454 /* 1455 - samesets - are these two characters in exactly the same sets? 1456 == static int samesets(register struct re_guts *g, int c1, int c2); 1457 */ 1458 static int /* predicate */ 1459 samesets(g, c1, c2) 1460 register struct re_guts *g; 1461 int c1; 1462 int c2; 1463 { 1464 register uch *col; 1465 register int i; 1466 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1467 register unsigned uc1 = (unsigned char)c1; 1468 register unsigned uc2 = (unsigned char)c2; 1469 1470 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1471 if (col[uc1] != col[uc2]) 1472 return(0); 1473 return(1); 1474 } 1475 1476 /* 1477 - categorize - sort out character categories 1478 == static void categorize(struct parse *p, register struct re_guts *g); 1479 */ 1480 static void 1481 categorize(p, g) 1482 struct parse *p; 1483 register struct re_guts *g; 1484 { 1485 register cat_t *cats = g->categories; 1486 register int c; 1487 register int c2; 1488 register cat_t cat; 1489 1490 /* avoid making error situations worse */ 1491 if (p->error != 0) 1492 return; 1493 1494 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1495 if (cats[c] == 0 && isinsets(g, c)) { 1496 cat = g->ncategories++; 1497 cats[c] = cat; 1498 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1499 if (cats[c2] == 0 && samesets(g, c, c2)) 1500 cats[c2] = cat; 1501 } 1502 } 1503 1504 /* 1505 - dupl - emit a duplicate of a bunch of sops 1506 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1507 */ 1508 static sopno /* start of duplicate */ 1509 dupl(p, start, finish) 1510 register struct parse *p; 1511 sopno start; /* from here */ 1512 sopno finish; /* to this less one */ 1513 { 1514 register sopno ret = HERE(); 1515 register sopno len = finish - start; 1516 1517 assert(finish >= start); 1518 if (len == 0) 1519 return(ret); 1520 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1521 assert(p->ssize >= p->slen + len); 1522 (void) memcpy((char *)(p->strip + p->slen), 1523 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1524 p->slen += len; 1525 return(ret); 1526 } 1527 1528 /* 1529 - doemit - emit a strip operator 1530 == static void doemit(register struct parse *p, sop op, size_t opnd); 1531 * 1532 * It might seem better to implement this as a macro with a function as 1533 * hard-case backup, but it's just too big and messy unless there are 1534 * some changes to the data structures. Maybe later. 1535 */ 1536 static void 1537 doemit(p, op, opnd) 1538 register struct parse *p; 1539 sop op; 1540 size_t opnd; 1541 { 1542 /* avoid making error situations worse */ 1543 if (p->error != 0) 1544 return; 1545 1546 /* deal with oversize operands ("can't happen", more or less) */ 1547 assert(opnd < 1<<OPSHIFT); 1548 1549 /* deal with undersized strip */ 1550 if (p->slen >= p->ssize) 1551 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1552 assert(p->slen < p->ssize); 1553 1554 /* finally, it's all reduced to the easy case */ 1555 p->strip[p->slen++] = SOP(op, opnd); 1556 } 1557 1558 /* 1559 - doinsert - insert a sop into the strip 1560 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1561 */ 1562 static void 1563 doinsert(p, op, opnd, pos) 1564 register struct parse *p; 1565 sop op; 1566 size_t opnd; 1567 sopno pos; 1568 { 1569 register sopno sn; 1570 register sop s; 1571 register int i; 1572 1573 /* avoid making error situations worse */ 1574 if (p->error != 0) 1575 return; 1576 1577 sn = HERE(); 1578 EMIT(op, opnd); /* do checks, ensure space */ 1579 assert(HERE() == sn+1); 1580 s = p->strip[sn]; 1581 1582 /* adjust paren pointers */ 1583 assert(pos > 0); 1584 for (i = 1; i < NPAREN; i++) { 1585 if (p->pbegin[i] >= pos) { 1586 p->pbegin[i]++; 1587 } 1588 if (p->pend[i] >= pos) { 1589 p->pend[i]++; 1590 } 1591 } 1592 1593 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1594 (HERE()-pos-1)*sizeof(sop)); 1595 p->strip[pos] = s; 1596 } 1597 1598 /* 1599 - dofwd - complete a forward reference 1600 == static void dofwd(register struct parse *p, sopno pos, sop value); 1601 */ 1602 static void 1603 dofwd(p, pos, value) 1604 register struct parse *p; 1605 register sopno pos; 1606 sop value; 1607 { 1608 /* avoid making error situations worse */ 1609 if (p->error != 0) 1610 return; 1611 1612 assert(value < 1<<OPSHIFT); 1613 p->strip[pos] = OP(p->strip[pos]) | value; 1614 } 1615 1616 /* 1617 - enlarge - enlarge the strip 1618 == static void enlarge(register struct parse *p, sopno size); 1619 */ 1620 static void 1621 enlarge(p, size) 1622 register struct parse *p; 1623 register sopno size; 1624 { 1625 register sop *sp; 1626 1627 if (p->ssize >= size) 1628 return; 1629 1630 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1631 if (sp == NULL) { 1632 SETERROR(REG_ESPACE); 1633 return; 1634 } 1635 p->strip = sp; 1636 p->ssize = size; 1637 } 1638 1639 /* 1640 - stripsnug - compact the strip 1641 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1642 */ 1643 static void 1644 stripsnug(p, g) 1645 register struct parse *p; 1646 register struct re_guts *g; 1647 { 1648 g->nstates = p->slen; 1649 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1650 if (g->strip == NULL) { 1651 SETERROR(REG_ESPACE); 1652 g->strip = p->strip; 1653 } 1654 } 1655 1656 /* 1657 - findmust - fill in must and mlen with longest mandatory literal string 1658 == static void findmust(register struct parse *p, register struct re_guts *g); 1659 * 1660 * This algorithm could do fancy things like analyzing the operands of | 1661 * for common subsequences. Someday. This code is simple and finds most 1662 * of the interesting cases. 1663 * 1664 * Note that must and mlen got initialized during setup. 1665 */ 1666 static void 1667 findmust(p, g) 1668 struct parse *p; 1669 register struct re_guts *g; 1670 { 1671 register sop *scan; 1672 sop *start; 1673 register sop *newstart; 1674 register sopno newlen; 1675 register sop s; 1676 register char *cp; 1677 register sopno i; 1678 1679 /* avoid making error situations worse */ 1680 if (p->error != 0) 1681 return; 1682 1683 /* find the longest OCHAR sequence in strip */ 1684 newlen = 0; 1685 scan = g->strip + 1; 1686 do { 1687 s = *scan++; 1688 switch (OP(s)) { 1689 case OCHAR: /* sequence member */ 1690 if (newlen == 0) /* new sequence */ 1691 newstart = scan - 1; 1692 newlen++; 1693 break; 1694 case OPLUS_: /* things that don't break one */ 1695 case OLPAREN: 1696 case ORPAREN: 1697 break; 1698 case OQUEST_: /* things that must be skipped */ 1699 case OCH_: 1700 scan--; 1701 do { 1702 scan += OPND(s); 1703 s = *scan; 1704 /* assert() interferes w debug printouts */ 1705 if (OP(s) != O_QUEST && OP(s) != O_CH && 1706 OP(s) != OOR2) { 1707 g->iflags |= BAD; 1708 return; 1709 } 1710 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1711 /* fallthrough */ 1712 default: /* things that break a sequence */ 1713 if (newlen > g->mlen) { /* ends one */ 1714 start = newstart; 1715 g->mlen = newlen; 1716 } 1717 newlen = 0; 1718 break; 1719 } 1720 } while (OP(s) != OEND); 1721 1722 if (g->mlen == 0) /* there isn't one */ 1723 return; 1724 1725 /* turn it into a character string */ 1726 g->must = malloc((size_t)g->mlen + 1); 1727 if (g->must == NULL) { /* argh; just forget it */ 1728 g->mlen = 0; 1729 return; 1730 } 1731 cp = g->must; 1732 scan = start; 1733 for (i = g->mlen; i > 0; i--) { 1734 while (OP(s = *scan++) != OCHAR) 1735 continue; 1736 assert(cp < g->must + g->mlen); 1737 *cp++ = (char)OPND(s); 1738 } 1739 assert(cp == g->must + g->mlen); 1740 *cp++ = '\0'; /* just on general principles */ 1741 } 1742 1743 /* 1744 - pluscount - count + nesting 1745 == static sopno pluscount(register struct parse *p, register struct re_guts *g); 1746 */ 1747 static sopno /* nesting depth */ 1748 pluscount(p, g) 1749 struct parse *p; 1750 register struct re_guts *g; 1751 { 1752 register sop *scan; 1753 register sop s; 1754 register sopno plusnest = 0; 1755 register sopno maxnest = 0; 1756 1757 if (p->error != 0) 1758 return(0); /* there may not be an OEND */ 1759 1760 scan = g->strip + 1; 1761 do { 1762 s = *scan++; 1763 switch (OP(s)) { 1764 case OPLUS_: 1765 plusnest++; 1766 break; 1767 case O_PLUS: 1768 if (plusnest > maxnest) 1769 maxnest = plusnest; 1770 plusnest--; 1771 break; 1772 } 1773 } while (OP(s) != OEND); 1774 if (plusnest != 0) 1775 g->iflags |= BAD; 1776 return(maxnest); 1777 } 1778