1 /* $NetBSD: regcomp.c,v 1.7 2011/11/19 17:45:11 tnozaki Exp $ */ 2 3 /*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer of the University of Toronto. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 * 39 * @(#)regcomp.c 8.4 (Berkeley) 3/19/94 40 */ 41 42 #if defined(LIBC_SCCS) && !defined(lint) 43 static char sccsid[] = "@(#)regcomp.c 8.4 (Berkeley) 3/19/94"; 44 #endif /* LIBC_SCCS and not lint */ 45 46 #include <sys/types.h> 47 #include <stdio.h> 48 #include <string.h> 49 #include <ctype.h> 50 #include <limits.h> 51 #include <stdlib.h> 52 #include <regex.h> 53 54 #include "utils.h" 55 #include "regex2.h" 56 57 #include "cclass.h" 58 #include "cname.h" 59 60 /* 61 * parse structure, passed up and down to avoid global variables and 62 * other clumsinesses 63 */ 64 struct parse { 65 RCHAR_T *next; /* next character in RE */ 66 RCHAR_T *end; /* end of string (-> NUL normally) */ 67 int error; /* has an error been seen? */ 68 sop *strip; /* malloced strip */ 69 RCHAR_T *stripdata; /* malloced stripdata */ 70 sopno ssize; /* malloced strip size (allocated) */ 71 sopno slen; /* malloced strip length (used) */ 72 int ncsalloc; /* number of csets allocated */ 73 struct re_guts *g; 74 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 75 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 76 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 77 }; 78 79 /* ========= begin header generated by ./mkh ========= */ 80 #ifdef __cplusplus 81 extern "C" { 82 #endif 83 84 /* === regcomp.c === */ 85 static void p_ere __P((struct parse *p, int stop, size_t reclimit)); 86 static void p_ere_exp __P((struct parse *p, size_t reclimit)); 87 static void p_str __P((struct parse *p)); 88 static void p_bre __P((struct parse *p, int end1, int end2, size_t reclimit)); 89 static int p_simp_re __P((struct parse *p, int starordinary, size_t reclimit)); 90 static int p_count __P((struct parse *p)); 91 static void p_bracket __P((struct parse *p)); 92 static void p_b_term __P((struct parse *p, cset *cs)); 93 static void p_b_cclass __P((struct parse *p, cset *cs)); 94 static void p_b_eclass __P((struct parse *p, cset *cs)); 95 static char p_b_symbol __P((struct parse *p)); 96 static char p_b_coll_elem __P((struct parse *p, int endc)); 97 static char othercase __P((int ch)); 98 static void bothcases __P((struct parse *p, int ch)); 99 static void ordinary __P((struct parse *p, int ch)); 100 static void nonnewline __P((struct parse *p)); 101 static void repeat __P((struct parse *p, sopno start, int from, int to, size_t reclimit)); 102 static int seterr __P((struct parse *p, int e)); 103 static cset *allocset __P((struct parse *p)); 104 static void freeset __P((struct parse *p, cset *cs)); 105 static int freezeset __P((struct parse *p, cset *cs)); 106 static int firstch __P((struct parse *p, cset *cs)); 107 static int nch __P((struct parse *p, cset *cs)); 108 static void mcadd __P((struct parse *p, cset *cs, const char *cp)); 109 #ifdef notdef 110 static void mcsub __P((cset *cs, char *cp)); 111 static int mcin __P((cset *cs, char *cp)); 112 static char *mcfind __P((cset *cs, char *cp)); 113 #endif 114 static void mcinvert __P((struct parse *p, cset *cs)); 115 static void mccase __P((struct parse *p, cset *cs)); 116 #ifdef notdef 117 static int isinsets __P((struct re_guts *g, int c)); 118 static int samesets __P((struct re_guts *g, int c1, int c2)); 119 #endif 120 static void categorize __P((struct parse *p, struct re_guts *g)); 121 static sopno dupl __P((struct parse *p, sopno start, sopno finish)); 122 static void doemit __P((struct parse *p, sop op, size_t opnd)); 123 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos)); 124 static void dofwd __P((struct parse *p, sopno pos, sop value)); 125 static int enlarge __P((struct parse *p, sopno size)); 126 static void stripsnug __P((struct parse *p, struct re_guts *g)); 127 static void findmust __P((struct parse *p, struct re_guts *g)); 128 static sopno pluscount __P((struct parse *p, struct re_guts *g)); 129 130 #ifdef __cplusplus 131 } 132 #endif 133 /* ========= end header generated by ./mkh ========= */ 134 135 static RCHAR_T nuls[10]; /* place to point scanner in event of error */ 136 137 /* 138 * macros for use with parse structure 139 * BEWARE: these know that the parse structure is named `p' !!! 140 */ 141 #define PEEK() (*p->next) 142 #define PEEK2() (*(p->next+1)) 143 #define MORE() (p->next < p->end) 144 #define MORE2() (p->next+1 < p->end) 145 #define SEE(c) (MORE() && PEEK() == (c)) 146 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 147 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 148 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 149 #define NEXT() (p->next++) 150 #define NEXT2() (p->next += 2) 151 #define NEXTn(n) (p->next += (n)) 152 #define GETNEXT() (*p->next++) 153 #define SETERROR(e) seterr(p, (e)) 154 #define REQUIRE(co, e) ((co) || SETERROR(e)) 155 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 156 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 157 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 158 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 159 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 160 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 161 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 162 #define HERE() (p->slen) 163 #define THERE() (p->slen - 1) 164 #define THERETHERE() (p->slen - 2) 165 #define DROP(n) (p->slen -= (n)) 166 167 #ifndef NDEBUG 168 static int never = 0; /* for use in asserts; shuts lint up */ 169 #else 170 #define never 0 /* some <assert.h>s have bugs too */ 171 #endif 172 173 #define MEMLIMIT 0x8000000 174 #define MEMSIZE(p) \ 175 ((p)->ncsalloc / CHAR_BIT * (p)->g->csetsize + \ 176 (p)->ncsalloc * sizeof(cset) + \ 177 (p)->ssize * sizeof(sop)) 178 #define RECLIMIT 256 179 180 /* 181 - regcomp - interface for parser and compilation 182 = extern int regcomp(regex_t *, const RCHAR_T *, int); 183 = #define REG_BASIC 0000 184 = #define REG_EXTENDED 0001 185 = #define REG_ICASE 0002 186 = #define REG_NOSUB 0004 187 = #define REG_NEWLINE 0010 188 = #define REG_NOSPEC 0020 189 = #define REG_PEND 0040 190 = #define REG_DUMP 0200 191 */ 192 int /* 0 success, otherwise REG_something */ 193 regcomp(regex_t *preg, const RCHAR_T *pattern, int cflags) 194 { 195 struct parse pa; 196 register struct re_guts *g; 197 register struct parse *p = &pa; 198 register int i; 199 register size_t len; 200 #ifdef REDEBUG 201 # define GOODFLAGS(f) (f) 202 #else 203 # define GOODFLAGS(f) ((f)&~REG_DUMP) 204 #endif 205 206 cflags = GOODFLAGS(cflags); 207 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 208 return(REG_INVARG); 209 210 if (cflags®_PEND) { 211 if (preg->re_endp < pattern) 212 return(REG_INVARG); 213 len = preg->re_endp - pattern; 214 } else 215 len = STRLEN(pattern); 216 217 /* do the mallocs early so failure handling is easy */ 218 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 219 (NC-1)*sizeof(cat_t)); 220 if (g == NULL) 221 return(REG_ESPACE); 222 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 223 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 224 if (p->strip == NULL) { 225 free((char *)g); 226 return(REG_ESPACE); 227 } 228 p->stripdata = (RCHAR_T *)malloc(p->ssize * sizeof(RCHAR_T)); 229 if (p->stripdata == NULL) { 230 free((char *)p->strip); 231 free((char *)g); 232 return(REG_ESPACE); 233 } 234 p->slen = 0; 235 236 /* set things up */ 237 p->g = g; 238 p->next = (RCHAR_T *)pattern; /* convenience; we do not modify it */ 239 p->end = p->next + len; 240 p->error = 0; 241 p->ncsalloc = 0; 242 for (i = 0; i < NPAREN; i++) { 243 p->pbegin[i] = 0; 244 p->pend[i] = 0; 245 } 246 g->csetsize = NC; 247 g->sets = NULL; 248 g->setbits = NULL; 249 g->ncsets = 0; 250 g->cflags = cflags; 251 g->iflags = 0; 252 g->nbol = 0; 253 g->neol = 0; 254 g->must = NULL; 255 g->mlen = 0; 256 g->nsub = 0; 257 #if 0 258 g->ncategories = 1; /* category 0 is "everything else" */ 259 g->categories = &g->catspace[-(CHAR_MIN)]; 260 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 261 #endif 262 g->backrefs = 0; 263 264 /* do it */ 265 EMIT(OEND, 0); 266 g->firststate = THERE(); 267 if (cflags®_EXTENDED) 268 p_ere(p, OUT, 0); 269 else if (cflags®_NOSPEC) 270 p_str(p); 271 else 272 p_bre(p, OUT, OUT, 0); 273 EMIT(OEND, 0); 274 g->laststate = THERE(); 275 276 /* tidy up loose ends and fill things in */ 277 categorize(p, g); 278 stripsnug(p, g); 279 findmust(p, g); 280 g->nplus = pluscount(p, g); 281 g->magic = MAGIC2; 282 preg->re_nsub = g->nsub; 283 preg->re_g = g; 284 preg->re_magic = MAGIC1; 285 #ifndef REDEBUG 286 /* not debugging, so can't rely on the assert() in regexec() */ 287 if (g->iflags&BAD) 288 SETERROR(REG_ASSERT); 289 #endif 290 291 /* win or lose, we're done */ 292 if (p->error != 0) /* lose */ 293 regfree(preg); 294 return(p->error); 295 } 296 297 /* 298 - p_ere - ERE parser top level, concatenation and alternation 299 == static void p_ere(register struct parse *p, int stop, size_t reclimit); 300 */ 301 static void 302 p_ere(register struct parse *p, int stop, size_t reclimit) 303 304 /* character this ERE should end at */ 305 { 306 register char c; 307 register sopno prevback = 0; 308 register sopno prevfwd = 0; 309 register sopno conc; 310 register int first = 1; /* is this the first alternative? */ 311 312 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 313 p->error = REG_ESPACE; 314 return; 315 } 316 317 for (;;) { 318 /* do a bunch of concatenated expressions */ 319 conc = HERE(); 320 while (MORE() && (c = PEEK()) != '|' && c != stop) 321 p_ere_exp(p, reclimit); 322 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 323 324 if (!EAT('|')) 325 break; /* NOTE BREAK OUT */ 326 327 if (first) { 328 INSERT(OCH_, conc); /* offset is wrong */ 329 prevfwd = conc; 330 prevback = conc; 331 first = 0; 332 } 333 ASTERN(OOR1, prevback); 334 prevback = THERE(); 335 AHEAD(prevfwd); /* fix previous offset */ 336 prevfwd = HERE(); 337 EMIT(OOR2, 0); /* offset is very wrong */ 338 } 339 340 if (!first) { /* tail-end fixups */ 341 AHEAD(prevfwd); 342 ASTERN(O_CH, prevback); 343 } 344 345 assert(!MORE() || SEE(stop)); 346 } 347 348 /* 349 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 350 == static void p_ere_exp(register struct parse *p); 351 */ 352 static void 353 p_ere_exp(register struct parse *p, size_t reclimit) 354 { 355 register char c; 356 register sopno pos; 357 register int count; 358 register int count2; 359 register sopno subno; 360 int wascaret = 0; 361 362 assert(MORE()); /* caller should have ensured this */ 363 c = GETNEXT(); 364 365 pos = HERE(); 366 switch (c) { 367 case '(': 368 (void)REQUIRE(MORE(), REG_EPAREN); 369 p->g->nsub++; 370 subno = p->g->nsub; 371 if (subno < NPAREN) 372 p->pbegin[subno] = HERE(); 373 EMIT(OLPAREN, subno); 374 if (!SEE(')')) 375 p_ere(p, ')', reclimit); 376 if (subno < NPAREN) { 377 p->pend[subno] = HERE(); 378 assert(p->pend[subno] != 0); 379 } 380 EMIT(ORPAREN, subno); 381 (void)MUSTEAT(')', REG_EPAREN); 382 break; 383 #ifndef POSIX_MISTAKE 384 case ')': /* happens only if no current unmatched ( */ 385 /* 386 * You may ask, why the ifndef? Because I didn't notice 387 * this until slightly too late for 1003.2, and none of the 388 * other 1003.2 regular-expression reviewers noticed it at 389 * all. So an unmatched ) is legal POSIX, at least until 390 * we can get it fixed. 391 */ 392 SETERROR(REG_EPAREN); 393 break; 394 #endif 395 case '^': 396 EMIT(OBOL, 0); 397 p->g->iflags |= USEBOL; 398 p->g->nbol++; 399 wascaret = 1; 400 break; 401 case '$': 402 EMIT(OEOL, 0); 403 p->g->iflags |= USEEOL; 404 p->g->neol++; 405 break; 406 case '|': 407 SETERROR(REG_EMPTY); 408 break; 409 case '*': 410 case '+': 411 case '?': 412 SETERROR(REG_BADRPT); 413 break; 414 case '.': 415 if (p->g->cflags®_NEWLINE) 416 nonnewline(p); 417 else 418 EMIT(OANY, 0); 419 break; 420 case '[': 421 p_bracket(p); 422 break; 423 case '\\': 424 (void)REQUIRE(MORE(), REG_EESCAPE); 425 c = GETNEXT(); 426 ordinary(p, c); 427 break; 428 case '{': /* okay as ordinary except if digit follows */ 429 (void)REQUIRE(!MORE() || !ISDIGIT((UCHAR_T)PEEK()), REG_BADRPT); 430 /* FALLTHROUGH */ 431 default: 432 ordinary(p, c); 433 break; 434 } 435 436 if (!MORE()) 437 return; 438 c = PEEK(); 439 /* we call { a repetition if followed by a digit */ 440 if (!( c == '*' || c == '+' || c == '?' || 441 (c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) )) 442 return; /* no repetition, we're done */ 443 NEXT(); 444 445 (void)REQUIRE(!wascaret, REG_BADRPT); 446 switch (c) { 447 case '*': /* implemented as +? */ 448 /* this case does not require the (y|) trick, noKLUDGE */ 449 INSERT(OPLUS_, pos); 450 ASTERN(O_PLUS, pos); 451 INSERT(OQUEST_, pos); 452 ASTERN(O_QUEST, pos); 453 break; 454 case '+': 455 INSERT(OPLUS_, pos); 456 ASTERN(O_PLUS, pos); 457 break; 458 case '?': 459 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 460 INSERT(OCH_, pos); /* offset slightly wrong */ 461 ASTERN(OOR1, pos); /* this one's right */ 462 AHEAD(pos); /* fix the OCH_ */ 463 EMIT(OOR2, 0); /* offset very wrong... */ 464 AHEAD(THERE()); /* ...so fix it */ 465 ASTERN(O_CH, THERETHERE()); 466 break; 467 case '{': 468 count = p_count(p); 469 if (EAT(',')) { 470 if (ISDIGIT((UCHAR_T)PEEK())) { 471 count2 = p_count(p); 472 (void)REQUIRE(count <= count2, REG_BADBR); 473 } else /* single number with comma */ 474 count2 = INFINITY; 475 } else /* just a single number */ 476 count2 = count; 477 repeat(p, pos, count, count2, 0); 478 if (!EAT('}')) { /* error heuristics */ 479 while (MORE() && PEEK() != '}') 480 NEXT(); 481 (void)REQUIRE(MORE(), REG_EBRACE); 482 SETERROR(REG_BADBR); 483 } 484 break; 485 } 486 487 if (!MORE()) 488 return; 489 c = PEEK(); 490 if (!( c == '*' || c == '+' || c == '?' || 491 (c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) ) ) 492 return; 493 SETERROR(REG_BADRPT); 494 } 495 496 /* 497 - p_str - string (no metacharacters) "parser" 498 == static void p_str(register struct parse *p); 499 */ 500 static void 501 p_str(register struct parse *p) 502 { 503 (void)REQUIRE(MORE(), REG_EMPTY); 504 while (MORE()) 505 ordinary(p, GETNEXT()); 506 } 507 508 /* 509 - p_bre - BRE parser top level, anchoring and concatenation 510 == static void p_bre(register struct parse *p, register int end1, \ 511 == register int end2, size_t reclimit); 512 * Giving end1 as OUT essentially eliminates the end1/end2 check. 513 * 514 * This implementation is a bit of a kludge, in that a trailing $ is first 515 * taken as an ordinary character and then revised to be an anchor. The 516 * only undesirable side effect is that '$' gets included as a character 517 * category in such cases. This is fairly harmless; not worth fixing. 518 * The amount of lookahead needed to avoid this kludge is excessive. 519 */ 520 static void 521 p_bre(register struct parse *p, register int end1, register int end2, size_t reclimit) 522 523 /* first terminating character */ 524 /* second terminating character */ 525 { 526 register sopno start; 527 register int first = 1; /* first subexpression? */ 528 register int wasdollar = 0; 529 530 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 531 p->error = REG_ESPACE; 532 return; 533 } 534 535 start = HERE(); 536 537 if (EAT('^')) { 538 EMIT(OBOL, 0); 539 p->g->iflags |= USEBOL; 540 p->g->nbol++; 541 } 542 while (MORE() && !SEETWO(end1, end2)) { 543 wasdollar = p_simp_re(p, first, reclimit); 544 first = 0; 545 } 546 if (wasdollar) { /* oops, that was a trailing anchor */ 547 DROP(1); 548 EMIT(OEOL, 0); 549 p->g->iflags |= USEEOL; 550 p->g->neol++; 551 } 552 553 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 554 } 555 556 /* 557 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 558 == static int p_simp_re(register struct parse *p, int starordinary, size_t reclimit); 559 */ 560 static int /* was the simple RE an unbackslashed $? */ 561 p_simp_re(register struct parse *p, int starordinary, size_t reclimit) 562 563 /* is a leading * an ordinary character? */ 564 { 565 register int c; 566 register int count; 567 register int count2; 568 register sopno pos; 569 register int i; 570 register sopno subno; 571 int backsl; 572 573 pos = HERE(); /* repetion op, if any, covers from here */ 574 575 assert(MORE()); /* caller should have ensured this */ 576 c = GETNEXT(); 577 backsl = c == '\\'; 578 if (backsl) { 579 (void)REQUIRE(MORE(), REG_EESCAPE); 580 c = (unsigned char)GETNEXT(); 581 switch (c) { 582 case '{': 583 SETERROR(REG_BADRPT); 584 break; 585 case '(': 586 p->g->nsub++; 587 subno = p->g->nsub; 588 if (subno < NPAREN) 589 p->pbegin[subno] = HERE(); 590 EMIT(OLPAREN, subno); 591 /* the MORE here is an error heuristic */ 592 if (MORE() && !SEETWO('\\', ')')) 593 p_bre(p, '\\', ')', reclimit); 594 if (subno < NPAREN) { 595 p->pend[subno] = HERE(); 596 assert(p->pend[subno] != 0); 597 } 598 EMIT(ORPAREN, subno); 599 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 600 break; 601 case ')': /* should not get here -- must be user */ 602 case '}': 603 SETERROR(REG_EPAREN); 604 break; 605 case '1': 606 case '2': 607 case '3': 608 case '4': 609 case '5': 610 case '6': 611 case '7': 612 case '8': 613 case '9': 614 i = c - '0'; 615 assert(i < NPAREN); 616 if (p->pend[i] != 0) { 617 assert(i <= p->g->nsub); 618 EMIT(OBACK_, i); 619 assert(p->pbegin[i] != 0); 620 assert(p->strip[p->pbegin[i]] == OLPAREN); 621 assert(p->strip[p->pend[i]] == ORPAREN); 622 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 623 EMIT(O_BACK, i); 624 } else 625 SETERROR(REG_ESUBREG); 626 p->g->backrefs = 1; 627 break; 628 default: 629 ordinary(p, c); 630 break; 631 } 632 } else { 633 switch (c) { 634 case '.': 635 if (p->g->cflags®_NEWLINE) 636 nonnewline(p); 637 else 638 EMIT(OANY, 0); 639 break; 640 case '[': 641 p_bracket(p); 642 break; 643 case '*': 644 (void)REQUIRE(starordinary, REG_BADRPT); 645 /* FALLTHROUGH */ 646 default: 647 ordinary(p, c); 648 break; 649 } 650 } 651 652 if (EAT('*')) { /* implemented as +? */ 653 /* this case does not require the (y|) trick, noKLUDGE */ 654 INSERT(OPLUS_, pos); 655 ASTERN(O_PLUS, pos); 656 INSERT(OQUEST_, pos); 657 ASTERN(O_QUEST, pos); 658 } else if (EATTWO('\\', '{')) { 659 count = p_count(p); 660 if (EAT(',')) { 661 if (MORE() && ISDIGIT((UCHAR_T)PEEK())) { 662 count2 = p_count(p); 663 (void)REQUIRE(count <= count2, REG_BADBR); 664 } else /* single number with comma */ 665 count2 = INFINITY; 666 } else /* just a single number */ 667 count2 = count; 668 repeat(p, pos, count, count2, reclimit); 669 if (!EATTWO('\\', '}')) { /* error heuristics */ 670 while (MORE() && !SEETWO('\\', '}')) 671 NEXT(); 672 (void)REQUIRE(MORE(), REG_EBRACE); 673 SETERROR(REG_BADBR); 674 } 675 } else if (!backsl && c == (unsigned char)'$') /* $ (but not \$) ends it */ 676 return(1); 677 678 return(0); 679 } 680 681 /* 682 - p_count - parse a repetition count 683 == static int p_count(register struct parse *p); 684 */ 685 static int /* the value */ 686 p_count(register struct parse *p) 687 { 688 register int count = 0; 689 register int ndigits = 0; 690 691 while (MORE() && ISDIGIT((UCHAR_T)PEEK()) && count <= DUPMAX) { 692 count = count*10 + (GETNEXT() - '0'); 693 ndigits++; 694 } 695 696 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 697 return(count); 698 } 699 700 /* 701 - p_bracket - parse a bracketed character list 702 == static void p_bracket(register struct parse *p); 703 * 704 * Note a significant property of this code: if the allocset() did SETERROR, 705 * no set operations are done. 706 */ 707 static void 708 p_bracket(register struct parse *p) 709 { 710 register cset *cs; 711 register int invert = 0; 712 static RCHAR_T bow[] = { '[', ':', '<', ':', ']', ']' }; 713 static RCHAR_T eow[] = { '[', ':', '>', ':', ']', ']' }; 714 715 cs = allocset(p); 716 if (cs == NULL) 717 return; 718 719 /* Dept of Truly Sickening Special-Case Kludges */ 720 if (p->next + 5 < p->end && MEMCMP(p->next, bow, 6) == 0) { 721 EMIT(OBOW, 0); 722 NEXTn(6); 723 return; 724 } 725 if (p->next + 5 < p->end && MEMCMP(p->next, eow, 6) == 0) { 726 EMIT(OEOW, 0); 727 NEXTn(6); 728 return; 729 } 730 731 if (EAT('^')) 732 invert++; /* make note to invert set at end */ 733 if (EAT(']')) 734 CHadd(cs, ']'); 735 else if (EAT('-')) 736 CHadd(cs, '-'); 737 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 738 p_b_term(p, cs); 739 if (EAT('-')) 740 CHadd(cs, '-'); 741 (void)MUSTEAT(']', REG_EBRACK); 742 743 if (p->error != 0) /* don't mess things up further */ 744 return; 745 746 if (p->g->cflags®_ICASE) { 747 register int i; 748 register int ci; 749 750 for (i = p->g->csetsize - 1; i >= 0; i--) 751 if (CHIN(cs, i) && isalpha(i)) { 752 ci = othercase(i); 753 if (ci != i) 754 CHadd(cs, ci); 755 } 756 if (cs->multis != NULL) 757 mccase(p, cs); 758 } 759 if (invert) { 760 register int i; 761 762 for (i = p->g->csetsize - 1; i >= 0; i--) 763 if (CHIN(cs, i)) 764 CHsub(cs, i); 765 else 766 CHadd(cs, i); 767 if (p->g->cflags®_NEWLINE) 768 CHsub(cs, '\n'); 769 if (cs->multis != NULL) 770 mcinvert(p, cs); 771 } 772 773 assert(cs->multis == NULL); /* xxx */ 774 775 if (nch(p, cs) == 1) { /* optimize singleton sets */ 776 ordinary(p, firstch(p, cs)); 777 freeset(p, cs); 778 } else 779 EMIT(OANYOF, freezeset(p, cs)); 780 } 781 782 /* 783 - p_b_term - parse one term of a bracketed character list 784 == static void p_b_term(register struct parse *p, register cset *cs); 785 */ 786 static void 787 p_b_term(register struct parse *p, register cset *cs) 788 { 789 register char c; 790 register char start, finish; 791 register int i; 792 793 /* classify what we've got */ 794 switch ((MORE()) ? PEEK() : '\0') { 795 case '[': 796 c = (MORE2()) ? PEEK2() : '\0'; 797 break; 798 case '-': 799 SETERROR(REG_ERANGE); 800 return; /* NOTE RETURN */ 801 break; 802 default: 803 c = '\0'; 804 break; 805 } 806 807 switch (c) { 808 case ':': /* character class */ 809 NEXT2(); 810 (void)REQUIRE(MORE(), REG_EBRACK); 811 c = PEEK(); 812 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 813 p_b_cclass(p, cs); 814 (void)REQUIRE(MORE(), REG_EBRACK); 815 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 816 break; 817 case '=': /* equivalence class */ 818 NEXT2(); 819 (void)REQUIRE(MORE(), REG_EBRACK); 820 c = PEEK(); 821 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 822 p_b_eclass(p, cs); 823 (void)REQUIRE(MORE(), REG_EBRACK); 824 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 825 break; 826 default: /* symbol, ordinary character, or range */ 827 /* xxx revision needed for multichar stuff */ 828 start = p_b_symbol(p); 829 if (SEE('-') && MORE2() && PEEK2() != ']') { 830 /* range */ 831 NEXT(); 832 if (EAT('-')) 833 finish = '-'; 834 else 835 finish = p_b_symbol(p); 836 } else 837 finish = start; 838 /* xxx what about signed chars here... */ 839 (void)REQUIRE(start <= finish, REG_ERANGE); 840 for (i = start; i <= finish; i++) 841 CHadd(cs, i); 842 break; 843 } 844 } 845 846 /* 847 - p_b_cclass - parse a character-class name and deal with it 848 == static void p_b_cclass(register struct parse *p, register cset *cs); 849 */ 850 static void 851 p_b_cclass(register struct parse *p, register cset *cs) 852 { 853 register RCHAR_T *sp = p->next; 854 register struct cclass *cp; 855 register size_t len; 856 register const char *u; 857 register char c; 858 859 while (MORE() && isalpha(PEEK())) 860 NEXT(); 861 len = p->next - sp; 862 for (cp = cclasses; cp->name != NULL; cp++) 863 if (STRLEN(cp->name) == len && !MEMCMP(cp->name, sp, len)) 864 break; 865 if (cp->name == NULL) { 866 /* oops, didn't find it */ 867 SETERROR(REG_ECTYPE); 868 return; 869 } 870 871 u = cp->chars; 872 while ((c = *u++) != '\0') 873 CHadd(cs, c); 874 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 875 MCadd(p, cs, u); 876 } 877 878 /* 879 - p_b_eclass - parse an equivalence-class name and deal with it 880 == static void p_b_eclass(register struct parse *p, register cset *cs); 881 * 882 * This implementation is incomplete. xxx 883 */ 884 static void 885 p_b_eclass(register struct parse *p, register cset *cs) 886 { 887 register char c; 888 889 c = p_b_coll_elem(p, '='); 890 CHadd(cs, c); 891 } 892 893 /* 894 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 895 == static char p_b_symbol(register struct parse *p); 896 */ 897 static char /* value of symbol */ 898 p_b_symbol(register struct parse *p) 899 { 900 register char value; 901 902 (void)REQUIRE(MORE(), REG_EBRACK); 903 if (!EATTWO('[', '.')) 904 return(GETNEXT()); 905 906 /* collating symbol */ 907 value = p_b_coll_elem(p, '.'); 908 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 909 return(value); 910 } 911 912 /* 913 - p_b_coll_elem - parse a collating-element name and look it up 914 == static char p_b_coll_elem(register struct parse *p, int endc); 915 */ 916 static char /* value of collating element */ 917 p_b_coll_elem(register struct parse *p, int endc) 918 919 /* name ended by endc,']' */ 920 { 921 register RCHAR_T *sp = p->next; 922 register struct cname *cp; 923 register size_t len; 924 925 while (MORE() && !SEETWO(endc, ']')) 926 NEXT(); 927 if (!MORE()) { 928 SETERROR(REG_EBRACK); 929 return(0); 930 } 931 len = p->next - sp; 932 for (cp = cnames; cp->name != NULL; cp++) 933 if (STRLEN(cp->name) == len && MEMCMP(cp->name, sp, len)) 934 return(cp->code); /* known name */ 935 if (len == 1) 936 return(*sp); /* single character */ 937 SETERROR(REG_ECOLLATE); /* neither */ 938 return(0); 939 } 940 941 /* 942 - othercase - return the case counterpart of an alphabetic 943 == static char othercase(int ch); 944 */ 945 static char /* if no counterpart, return ch */ 946 othercase(int ch) 947 { 948 assert(isalpha(ch)); 949 if (isupper(ch)) 950 return(tolower(ch)); 951 else if (islower(ch)) 952 return(toupper(ch)); 953 else /* peculiar, but could happen */ 954 return(ch); 955 } 956 957 /* 958 - bothcases - emit a dualcase version of a two-case character 959 == static void bothcases(register struct parse *p, int ch); 960 * 961 * Boy, is this implementation ever a kludge... 962 */ 963 static void 964 bothcases(register struct parse *p, int ch) 965 { 966 register RCHAR_T *oldnext = p->next; 967 register RCHAR_T *oldend = p->end; 968 RCHAR_T bracket[3]; 969 970 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 971 p->next = bracket; 972 p->end = bracket+2; 973 bracket[0] = ch; 974 bracket[1] = ']'; 975 bracket[2] = '\0'; 976 p_bracket(p); 977 assert(p->next == bracket+2); 978 p->next = oldnext; 979 p->end = oldend; 980 } 981 982 /* 983 - ordinary - emit an ordinary character 984 == static void ordinary(register struct parse *p, register int ch); 985 */ 986 static void 987 ordinary(register struct parse *p, register int ch) 988 { 989 /* 990 register cat_t *cap = p->g->categories; 991 */ 992 993 if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch) 994 bothcases(p, ch); 995 else { 996 EMIT(OCHAR, (UCHAR_T)ch); 997 /* 998 if (cap[ch] == 0) 999 cap[ch] = p->g->ncategories++; 1000 */ 1001 } 1002 } 1003 1004 /* 1005 - nonnewline - emit REG_NEWLINE version of OANY 1006 == static void nonnewline(register struct parse *p); 1007 * 1008 * Boy, is this implementation ever a kludge... 1009 */ 1010 static void 1011 nonnewline(register struct parse *p) 1012 { 1013 register RCHAR_T *oldnext = p->next; 1014 register RCHAR_T *oldend = p->end; 1015 RCHAR_T bracket[4]; 1016 1017 p->next = bracket; 1018 p->end = bracket+3; 1019 bracket[0] = '^'; 1020 bracket[1] = '\n'; 1021 bracket[2] = ']'; 1022 bracket[3] = '\0'; 1023 p_bracket(p); 1024 assert(p->next == bracket+3); 1025 p->next = oldnext; 1026 p->end = oldend; 1027 } 1028 1029 /* 1030 - repeat - generate code for a bounded repetition, recursively if needed 1031 == static void repeat(register struct parse *p, sopno start, int from, int to, size_t reclimit); 1032 */ 1033 static void 1034 repeat(register struct parse *p, sopno start, int from, int to, size_t reclimit) 1035 1036 /* operand from here to end of strip */ 1037 /* repeated from this number */ 1038 /* to this number of times (maybe INFINITY) */ 1039 { 1040 register sopno finish; 1041 # define N 2 1042 # define INF 3 1043 # define REP(f, t) ((f)*8 + (t)) 1044 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1045 register sopno copy; 1046 1047 if (reclimit++ > RECLIMIT) 1048 p->error = REG_ESPACE; 1049 if (p->error) 1050 return; 1051 1052 finish = HERE(); 1053 1054 assert(from <= to); 1055 1056 switch (REP(MAP(from), MAP(to))) { 1057 case REP(0, 0): /* must be user doing this */ 1058 DROP(finish-start); /* drop the operand */ 1059 break; 1060 case REP(0, 1): /* as x{1,1}? */ 1061 case REP(0, N): /* as x{1,n}? */ 1062 case REP(0, INF): /* as x{1,}? */ 1063 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1064 INSERT(OCH_, start); /* offset is wrong... */ 1065 repeat(p, start+1, 1, to, reclimit); 1066 ASTERN(OOR1, start); 1067 AHEAD(start); /* ... fix it */ 1068 EMIT(OOR2, 0); 1069 AHEAD(THERE()); 1070 ASTERN(O_CH, THERETHERE()); 1071 break; 1072 case REP(1, 1): /* trivial case */ 1073 /* done */ 1074 break; 1075 case REP(1, N): /* as x?x{1,n-1} */ 1076 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1077 INSERT(OCH_, start); 1078 ASTERN(OOR1, start); 1079 AHEAD(start); 1080 EMIT(OOR2, 0); /* offset very wrong... */ 1081 AHEAD(THERE()); /* ...so fix it */ 1082 ASTERN(O_CH, THERETHERE()); 1083 copy = dupl(p, start+1, finish+1); 1084 assert(copy == finish+4); 1085 repeat(p, copy, 1, to-1, reclimit); 1086 break; 1087 case REP(1, INF): /* as x+ */ 1088 INSERT(OPLUS_, start); 1089 ASTERN(O_PLUS, start); 1090 break; 1091 case REP(N, N): /* as xx{m-1,n-1} */ 1092 copy = dupl(p, start, finish); 1093 repeat(p, copy, from-1, to-1, reclimit); 1094 break; 1095 case REP(N, INF): /* as xx{n-1,INF} */ 1096 copy = dupl(p, start, finish); 1097 repeat(p, copy, from-1, to, reclimit); 1098 break; 1099 default: /* "can't happen" */ 1100 SETERROR(REG_ASSERT); /* just in case */ 1101 break; 1102 } 1103 } 1104 1105 /* 1106 - seterr - set an error condition 1107 == static int seterr(register struct parse *p, int e); 1108 */ 1109 static int /* useless but makes type checking happy */ 1110 seterr(register struct parse *p, int e) 1111 { 1112 if (p->error == 0) /* keep earliest error condition */ 1113 p->error = e; 1114 p->next = nuls; /* try to bring things to a halt */ 1115 p->end = nuls; 1116 return(0); /* make the return value well-defined */ 1117 } 1118 1119 /* 1120 - allocset - allocate a set of characters for [] 1121 == static cset *allocset(register struct parse *p); 1122 */ 1123 static cset * 1124 allocset(register struct parse *p) 1125 { 1126 register int no = p->g->ncsets++; 1127 register size_t nc; 1128 register size_t nbytes; 1129 register cset *cs; 1130 register size_t css = (size_t)p->g->csetsize; 1131 register int i; 1132 1133 if (no >= p->ncsalloc) { /* need another column of space */ 1134 p->ncsalloc += CHAR_BIT; 1135 nc = p->ncsalloc; 1136 assert(nc % CHAR_BIT == 0); 1137 nbytes = nc / CHAR_BIT * css; 1138 if (MEMSIZE(p) > MEMLIMIT) 1139 goto oomem; 1140 if (p->g->sets == NULL) 1141 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1142 else 1143 p->g->sets = (cset *)realloc((char *)p->g->sets, 1144 nc * sizeof(cset)); 1145 if (p->g->setbits == NULL) 1146 p->g->setbits = (uch *)malloc(nbytes); 1147 else { 1148 p->g->setbits = (uch *)realloc((char *)p->g->setbits, 1149 nbytes); 1150 /* xxx this isn't right if setbits is now NULL */ 1151 for (i = 0; i < no; i++) 1152 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1153 } 1154 if (p->g->sets != NULL && p->g->setbits != NULL) 1155 (void) memset((char *)p->g->setbits + (nbytes - css), 1156 0, css); 1157 else { 1158 oomem: 1159 no = 0; 1160 SETERROR(REG_ESPACE); 1161 /* caller's responsibility not to do set ops */ 1162 return NULL; 1163 } 1164 } 1165 1166 cs = &p->g->sets[no]; 1167 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1168 cs->mask = 1 << ((no) % CHAR_BIT); 1169 cs->hash = 0; 1170 cs->smultis = 0; 1171 cs->multis = NULL; 1172 1173 return(cs); 1174 } 1175 1176 /* 1177 - freeset - free a now-unused set 1178 == static void freeset(register struct parse *p, register cset *cs); 1179 */ 1180 static void 1181 freeset(register struct parse *p, register cset *cs) 1182 { 1183 register size_t i; 1184 register cset *top = &p->g->sets[p->g->ncsets]; 1185 register size_t css = (size_t)p->g->csetsize; 1186 1187 for (i = 0; i < css; i++) 1188 CHsub(cs, i); 1189 if (cs == top-1) /* recover only the easy case */ 1190 p->g->ncsets--; 1191 } 1192 1193 /* 1194 - freezeset - final processing on a set of characters 1195 == static int freezeset(register struct parse *p, register cset *cs); 1196 * 1197 * The main task here is merging identical sets. This is usually a waste 1198 * of time (although the hash code minimizes the overhead), but can win 1199 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1200 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1201 * the same value! 1202 */ 1203 static int /* set number */ 1204 freezeset(register struct parse *p, register cset *cs) 1205 { 1206 register uch h = cs->hash; 1207 register size_t i; 1208 register cset *top = &p->g->sets[p->g->ncsets]; 1209 register cset *cs2; 1210 register size_t css = (size_t)p->g->csetsize; 1211 1212 /* look for an earlier one which is the same */ 1213 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1214 if (cs2->hash == h && cs2 != cs) { 1215 /* maybe */ 1216 for (i = 0; i < css; i++) 1217 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1218 break; /* no */ 1219 if (i == css) 1220 break; /* yes */ 1221 } 1222 1223 if (cs2 < top) { /* found one */ 1224 freeset(p, cs); 1225 cs = cs2; 1226 } 1227 1228 return((int)(cs - p->g->sets)); 1229 } 1230 1231 /* 1232 - firstch - return first character in a set (which must have at least one) 1233 == static int firstch(register struct parse *p, register cset *cs); 1234 */ 1235 static int /* character; there is no "none" value */ 1236 firstch(register struct parse *p, register cset *cs) 1237 { 1238 register size_t i; 1239 register size_t css = (size_t)p->g->csetsize; 1240 1241 for (i = 0; i < css; i++) 1242 if (CHIN(cs, i)) 1243 return((char)i); 1244 assert(never); 1245 return(0); /* arbitrary */ 1246 } 1247 1248 /* 1249 - nch - number of characters in a set 1250 == static int nch(register struct parse *p, register cset *cs); 1251 */ 1252 static int 1253 nch(register struct parse *p, register cset *cs) 1254 { 1255 register size_t i; 1256 register size_t css = (size_t)p->g->csetsize; 1257 register int n = 0; 1258 1259 for (i = 0; i < css; i++) 1260 if (CHIN(cs, i)) 1261 n++; 1262 return(n); 1263 } 1264 1265 /* 1266 - mcadd - add a collating element to a cset 1267 == static void mcadd(register struct parse *p, register cset *cs, \ 1268 == register char *cp); 1269 */ 1270 static void 1271 mcadd(register struct parse *p, register cset *cs, register const char *cp) 1272 { 1273 register size_t oldend = cs->smultis; 1274 void *np; 1275 1276 cs->smultis += strlen(cp) + 1; 1277 np = realloc(cs->multis, cs->smultis); 1278 if (np == NULL) { 1279 if (cs->multis) 1280 free(cs->multis); 1281 cs->multis = NULL; 1282 SETERROR(REG_ESPACE); 1283 return; 1284 } 1285 cs->multis = np; 1286 1287 (void) strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1); 1288 } 1289 1290 #ifdef notdef 1291 /* 1292 - mcsub - subtract a collating element from a cset 1293 == static void mcsub(register cset *cs, register char *cp); 1294 */ 1295 static void 1296 mcsub(register cset *cs, register char *cp) 1297 { 1298 register char *fp = mcfind(cs, cp); 1299 register size_t len = strlen(fp); 1300 1301 assert(fp != NULL); 1302 (void) memmove(fp, fp + len + 1, 1303 cs->smultis - (fp + len + 1 - cs->multis)); 1304 cs->smultis -= len; 1305 1306 if (cs->smultis == 0) { 1307 free(cs->multis); 1308 cs->multis = NULL; 1309 return; 1310 } 1311 1312 cs->multis = realloc(cs->multis, cs->smultis); 1313 assert(cs->multis != NULL); 1314 } 1315 1316 /* 1317 - mcin - is a collating element in a cset? 1318 == static int mcin(register cset *cs, register char *cp); 1319 */ 1320 static int 1321 mcin(register cset *cs, register char *cp) 1322 { 1323 return(mcfind(cs, cp) != NULL); 1324 } 1325 1326 /* 1327 - mcfind - find a collating element in a cset 1328 == static char *mcfind(register cset *cs, register char *cp); 1329 */ 1330 static char * 1331 mcfind(register cset *cs, register char *cp) 1332 { 1333 register char *p; 1334 1335 if (cs->multis == NULL) 1336 return(NULL); 1337 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1338 if (strcmp(cp, p) == 0) 1339 return(p); 1340 return(NULL); 1341 } 1342 #endif 1343 1344 /* 1345 - mcinvert - invert the list of collating elements in a cset 1346 == static void mcinvert(register struct parse *p, register cset *cs); 1347 * 1348 * This would have to know the set of possibilities. Implementation 1349 * is deferred. 1350 */ 1351 static void 1352 mcinvert(register struct parse *p, register cset *cs) 1353 { 1354 assert(cs->multis == NULL); /* xxx */ 1355 } 1356 1357 /* 1358 - mccase - add case counterparts of the list of collating elements in a cset 1359 == static void mccase(register struct parse *p, register cset *cs); 1360 * 1361 * This would have to know the set of possibilities. Implementation 1362 * is deferred. 1363 */ 1364 static void 1365 mccase(register struct parse *p, register cset *cs) 1366 { 1367 assert(cs->multis == NULL); /* xxx */ 1368 } 1369 1370 #ifdef notdef 1371 /* 1372 - isinsets - is this character in any sets? 1373 == static int isinsets(register struct re_guts *g, int c); 1374 */ 1375 static int /* predicate */ 1376 isinsets(register struct re_guts *g, int c) 1377 { 1378 register uch *col; 1379 register int i; 1380 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1381 register unsigned uc = (unsigned char)c; 1382 1383 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1384 if (col[uc] != 0) 1385 return(1); 1386 return(0); 1387 } 1388 1389 /* 1390 - samesets - are these two characters in exactly the same sets? 1391 == static int samesets(register struct re_guts *g, int c1, int c2); 1392 */ 1393 static int /* predicate */ 1394 samesets(register struct re_guts *g, int c1, int c2) 1395 { 1396 register uch *col; 1397 register int i; 1398 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1399 register unsigned uc1 = (unsigned char)c1; 1400 register unsigned uc2 = (unsigned char)c2; 1401 1402 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1403 if (col[uc1] != col[uc2]) 1404 return(0); 1405 return(1); 1406 } 1407 #endif 1408 1409 /* 1410 - categorize - sort out character categories 1411 == static void categorize(struct parse *p, register struct re_guts *g); 1412 */ 1413 static void 1414 categorize(struct parse *p, register struct re_guts *g) 1415 { 1416 #ifdef notdef 1417 register cat_t *cats = g->categories; 1418 register int c; 1419 register int c2; 1420 register cat_t cat; 1421 1422 /* avoid making error situations worse */ 1423 if (p->error != 0) 1424 return; 1425 1426 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1427 if (cats[c] == 0 && isinsets(g, c)) { 1428 cat = g->ncategories++; 1429 cats[c] = cat; 1430 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1431 if (cats[c2] == 0 && samesets(g, c, c2)) 1432 cats[c2] = cat; 1433 } 1434 #endif 1435 } 1436 1437 /* 1438 - dupl - emit a duplicate of a bunch of sops 1439 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1440 */ 1441 static sopno /* start of duplicate */ 1442 dupl(register struct parse *p, sopno start, sopno finish) 1443 1444 /* from here */ 1445 /* to this less one */ 1446 { 1447 register sopno ret = HERE(); 1448 register sopno len = finish - start; 1449 1450 assert(finish >= start); 1451 if (len == 0) 1452 return(ret); 1453 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1454 return ret; 1455 assert(p->ssize >= p->slen + len); 1456 (void) memcpy((char *)(p->strip + p->slen), 1457 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1458 (void) memcpy((char *)(p->stripdata + p->slen), 1459 (char *)(p->stripdata + start), (size_t)len*sizeof(RCHAR_T)); 1460 p->slen += len; 1461 return(ret); 1462 } 1463 1464 /* 1465 - doemit - emit a strip operator 1466 == static void doemit(register struct parse *p, sop op, size_t opnd); 1467 * 1468 * It might seem better to implement this as a macro with a function as 1469 * hard-case backup, but it's just too big and messy unless there are 1470 * some changes to the data structures. Maybe later. 1471 */ 1472 static void 1473 doemit(register struct parse *p, sop op, size_t opnd) 1474 { 1475 /* avoid making error situations worse */ 1476 if (p->error != 0) 1477 return; 1478 1479 /* deal with oversize operands ("can't happen", more or less) */ 1480 assert(opnd < 1); 1481 1482 /* deal with undersized strip */ 1483 if (p->slen >= p->ssize) 1484 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1485 return; 1486 1487 /* finally, it's all reduced to the easy case */ 1488 p->strip[p->slen] = op; 1489 p->stripdata[p->slen] = opnd; 1490 p->slen++; 1491 } 1492 1493 /* 1494 - doinsert - insert a sop into the strip 1495 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1496 */ 1497 static void 1498 doinsert(register struct parse *p, sop op, size_t opnd, sopno pos) 1499 { 1500 register sopno sn; 1501 register sop s; 1502 register RCHAR_T d; 1503 register int i; 1504 1505 /* avoid making error situations worse */ 1506 if (p->error != 0) 1507 return; 1508 1509 sn = HERE(); 1510 EMIT(op, opnd); /* do checks, ensure space */ 1511 assert(HERE() == sn+1); 1512 s = p->strip[sn]; 1513 d = p->stripdata[sn]; 1514 1515 /* adjust paren pointers */ 1516 assert(pos > 0); 1517 for (i = 1; i < NPAREN; i++) { 1518 if (p->pbegin[i] >= pos) { 1519 p->pbegin[i]++; 1520 } 1521 if (p->pend[i] >= pos) { 1522 p->pend[i]++; 1523 } 1524 } 1525 1526 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1527 (HERE()-pos-1)*sizeof(sop)); 1528 memmove((char *)&p->stripdata[pos+1], (char *)&p->stripdata[pos], 1529 (HERE()-pos-1)*sizeof(RCHAR_T)); 1530 p->strip[pos] = s; 1531 p->stripdata[pos] = d; 1532 } 1533 1534 /* 1535 - dofwd - complete a forward reference 1536 == static void dofwd(register struct parse *p, sopno pos, sop value); 1537 */ 1538 static void 1539 dofwd(register struct parse *p, register sopno pos, sop value) 1540 { 1541 /* avoid making error situations worse */ 1542 if (p->error != 0) 1543 return; 1544 1545 assert(value < 1); 1546 p->stripdata[pos] = value; 1547 } 1548 1549 /* 1550 - enlarge - enlarge the strip 1551 == static int enlarge(register struct parse *p, sopno size); 1552 */ 1553 static int 1554 enlarge(register struct parse *p, register sopno size) 1555 { 1556 register sop *sp; 1557 register RCHAR_T *dp; 1558 sopno osize; 1559 1560 if (p->ssize >= size) 1561 return 1; 1562 1563 osize = p->ssize; 1564 p->ssize = size; 1565 if (MEMSIZE(p) > MEMLIMIT) 1566 goto oomem; 1567 sp = realloc(p->strip, p->ssize * sizeof(sop)); 1568 if (sp == NULL) 1569 goto oomem; 1570 p->strip = sp; 1571 dp = realloc(p->stripdata, p->ssize * sizeof(RCHAR_T)); 1572 if (dp == NULL) { 1573 oomem: 1574 p->ssize = osize; 1575 SETERROR(REG_ESPACE); 1576 return 0; 1577 } 1578 p->stripdata = dp; 1579 return 1; 1580 } 1581 1582 /* 1583 - stripsnug - compact the strip 1584 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1585 */ 1586 static void 1587 stripsnug(register struct parse *p, register struct re_guts *g) 1588 { 1589 g->nstates = p->slen; 1590 g->strip = (sop *)realloc((char *)p->strip, 1591 p->slen * sizeof(sop)); 1592 if (g->strip == NULL) { 1593 SETERROR(REG_ESPACE); 1594 g->strip = p->strip; 1595 } 1596 g->stripdata = (RCHAR_T *)realloc((char *)p->stripdata, 1597 p->slen * sizeof(RCHAR_T)); 1598 if (g->stripdata == NULL) { 1599 SETERROR(REG_ESPACE); 1600 g->stripdata = p->stripdata; 1601 } 1602 } 1603 1604 /* 1605 - findmust - fill in must and mlen with longest mandatory literal string 1606 == static void findmust(register struct parse *p, register struct re_guts *g); 1607 * 1608 * This algorithm could do fancy things like analyzing the operands of | 1609 * for common subsequences. Someday. This code is simple and finds most 1610 * of the interesting cases. 1611 * 1612 * Note that must and mlen got initialized during setup. 1613 */ 1614 static void 1615 findmust(struct parse *p, register struct re_guts *g) 1616 { 1617 register sop *scans; 1618 register RCHAR_T *scand; 1619 sop *starts = 0; 1620 RCHAR_T *startd = NULL; 1621 register sop *newstarts = 0; 1622 register RCHAR_T *newstartd = NULL; 1623 register sopno newlen; 1624 register sop s; 1625 register RCHAR_T d; 1626 register RCHAR_T *cp; 1627 register sopno i; 1628 1629 /* avoid making error situations worse */ 1630 if (p->error != 0) 1631 return; 1632 1633 /* find the longest OCHAR sequence in strip */ 1634 newlen = 0; 1635 scans = g->strip + 1; 1636 scand = g->stripdata + 1; 1637 do { 1638 s = *scans++; 1639 d = *scand++; 1640 switch (s) { 1641 case OCHAR: /* sequence member */ 1642 if (newlen == 0) { /* new sequence */ 1643 newstarts = scans - 1; 1644 newstartd = scand - 1; 1645 } 1646 newlen++; 1647 break; 1648 case OPLUS_: /* things that don't break one */ 1649 case OLPAREN: 1650 case ORPAREN: 1651 break; 1652 case OQUEST_: /* things that must be skipped */ 1653 case OCH_: 1654 scans--; 1655 scand--; 1656 do { 1657 scans += d; 1658 scand += d; 1659 s = *scans; 1660 d = *scand; 1661 /* assert() interferes w debug printouts */ 1662 if (s != O_QUEST && s != O_CH && s != OOR2) { 1663 g->iflags |= BAD; 1664 return; 1665 } 1666 } while (s != O_QUEST && s != O_CH); 1667 /* fallthrough */ 1668 default: /* things that break a sequence */ 1669 if (newlen > g->mlen) { /* ends one */ 1670 starts = newstarts; 1671 startd = newstartd; 1672 g->mlen = newlen; 1673 } 1674 newlen = 0; 1675 break; 1676 } 1677 } while (s != OEND); 1678 1679 if (g->mlen == 0) /* there isn't one */ 1680 return; 1681 1682 /* turn it into a character string */ 1683 g->must = malloc(((size_t)g->mlen + 1) * sizeof(RCHAR_T)); 1684 if (g->must == NULL) { /* argh; just forget it */ 1685 g->mlen = 0; 1686 return; 1687 } 1688 cp = g->must; 1689 scans = starts; 1690 scand = startd; 1691 for (i = g->mlen; i > 0; i--) { 1692 for (;;) { 1693 s = *scans++; 1694 d = *scand++; 1695 if (s == OCHAR) 1696 break; 1697 } 1698 assert(cp < g->must + g->mlen); 1699 *cp++ = d; 1700 } 1701 assert(cp == g->must + g->mlen); 1702 *cp++ = '\0'; /* just on general principles */ 1703 } 1704 1705 /* 1706 - pluscount - count + nesting 1707 == static sopno pluscount(register struct parse *p, register struct re_guts *g); 1708 */ 1709 static sopno /* nesting depth */ 1710 pluscount(struct parse *p, register struct re_guts *g) 1711 { 1712 register sop *scan; 1713 register sop s; 1714 register sopno plusnest = 0; 1715 register sopno maxnest = 0; 1716 1717 if (p->error != 0) 1718 return(0); /* there may not be an OEND */ 1719 1720 scan = g->strip + 1; 1721 do { 1722 s = *scan++; 1723 switch (s) { 1724 case OPLUS_: 1725 plusnest++; 1726 break; 1727 case O_PLUS: 1728 if (plusnest > maxnest) 1729 maxnest = plusnest; 1730 plusnest--; 1731 break; 1732 } 1733 } while (s != OEND); 1734 if (plusnest != 0) 1735 g->iflags |= BAD; 1736 return(maxnest); 1737 } 1738