1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1985-2007 AT&T Knowledge Ventures * 5 * and is licensed under the * 6 * Common Public License, Version 1.0 * 7 * by AT&T Knowledge Ventures * 8 * * 9 * A copy of the License is available at * 10 * http://www.opensource.org/licenses/cpl1.0.txt * 11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * Phong Vo <kpv@research.att.com> * 20 * * 21 ***********************************************************************/ 22 #pragma prototyped 23 24 /* 25 * posix regex compiler 26 */ 27 28 #include "reglib.h" 29 30 #if _PACKAGE_ast 31 #include "lclib.h" 32 #endif 33 34 #define serialize re_serialize /* hp.ia64 <unistd.h>! */ 35 36 #define C_ESC (-1) 37 #define C_MB (-2) 38 39 #if _AST_REGEX_DEBUG 40 41 #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n)) 42 #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0) 43 #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_comp_debug")) debug |= strtoul(t, NiL, 0); } } while (0) 44 45 static unsigned long debug; 46 static unsigned long debug_flag; 47 48 #else 49 50 #define DEBUG_INIT() 51 #define DEBUG_TEST(f,y,n) (n) 52 #define DEBUG_CODE(f,y,n) do {n} while(0) 53 54 #endif 55 56 #if _PACKAGE_ast 57 58 typedef struct Cchr_s 59 { 60 Dtlink_t lnk; 61 unsigned char nam[2]; 62 Ckey_t key; 63 } Cchr_t; 64 65 #endif 66 67 #define eat(p) do{if ((p)->token.push)(p)->token.push=0;else (p)->cursor+=(p)->token.len;}while (0) 68 69 /* 70 * determine whether greedy matching will work, i.e. produce 71 * the best match first. such expressions are "easy", and 72 * need no backtracking once a complete match is found. 73 * if an expression has backreferences or alts it's hard 74 * else if it has only one closure it's easy 75 * else if all closures are simple (i.e. one-character) it's easy 76 * else it's hard. 77 */ 78 79 typedef struct Stats_s 80 { 81 unsigned long l; /* min length to left of x */ 82 unsigned long k; /* min length to left of y */ 83 unsigned long m; /* min length */ 84 unsigned long n; /* max length */ 85 unsigned short a; /* number of alternations */ 86 unsigned short b; /* number of backrefs */ 87 unsigned short c; /* number of closures */ 88 unsigned short i; /* number of negations */ 89 unsigned short p; /* number of named subexpressions */ 90 unsigned short s; /* number of simple closures */ 91 unsigned short t; /* number of tries */ 92 unsigned short u; /* number of unnamed subexpressions */ 93 Rex_t* x; /* max length REX_STRING */ 94 Rex_t* y; /* max length REX_TRIE */ 95 } Stats_t; 96 97 typedef struct Token_s 98 { 99 unsigned long min; 100 unsigned long max; 101 short lex; 102 short len; 103 short esc; 104 short att; 105 short push; 106 } Token_t; 107 108 typedef struct Cenv_s 109 { 110 int delimiter; /* pattern delimiter */ 111 int error; /* last error */ 112 int explicit; /* explicit match on this char */ 113 int mappeddot; /* inverse mapped '.' */ 114 int mappednewline; /* inverse mapped '\n' */ 115 int mappedslash; /* inverse mapped '/' */ 116 regflags_t flags; /* flags arg to regcomp */ 117 int type; /* BRE,ERE,ARE,SRE,KRE */ 118 unsigned char* cursor; /* curent point in re */ 119 unsigned char* pattern; /* the original pattern */ 120 unsigned char* literal; /* literal restart pattern */ 121 int parno; /* number of last open paren */ 122 int parnest; /* paren nest count */ 123 int posixkludge; /* to make * nonspecial */ 124 Token_t token; /* token lookahead */ 125 Stats_t stats; /* RE statistics */ 126 int terminator; /* pattern terminator */ 127 Rex_t* paren[2*(BACK_REF_MAX+2)]; 128 /* paren[i]!=0 if \i defined */ 129 regex_t* regex; /* user handle */ 130 regdisc_t* disc; /* user discipline */ 131 unsigned char* map; /* external to native ccode map */ 132 unsigned char* MAP; /* fold and/or map */ 133 } Cenv_t; 134 135 /* 136 * allocate a new Rex_t node 137 */ 138 139 static Rex_t* 140 node(Cenv_t* env, int type, int lo, int hi, size_t extra) 141 { 142 register Rex_t* e; 143 144 if (e = (Rex_t*)alloc(env->disc, 0, sizeof(Rex_t) + extra)) 145 { 146 memset(e, 0, sizeof(Rex_t) + extra); 147 e->type = type; 148 e->marked = 0; 149 e->lo = lo; 150 e->hi = hi; 151 e->flags = env->flags; 152 e->map = (e->flags & REG_ICASE) ? env->MAP : env->map; 153 e->explicit = env->explicit; 154 if (extra) 155 e->re.data = (char*)e + sizeof(Rex_t); 156 } 157 return e; 158 } 159 160 /* 161 * free a Trie_node_t node 162 */ 163 164 static void 165 triedrop(regdisc_t* disc, Trie_node_t* e) 166 { 167 if (e) 168 { 169 triedrop(disc, e->son); 170 triedrop(disc, e->sib); 171 alloc(disc, e, 0); 172 } 173 } 174 175 /* 176 * free a Rex_t node 177 */ 178 179 void 180 drop(regdisc_t* disc, Rex_t* e) 181 { 182 int i; 183 Rex_t* x; 184 185 if (e && !(disc->re_flags & REG_NOFREE)) 186 do 187 { 188 switch (e->type) 189 { 190 case REX_ALT: 191 case REX_CONJ: 192 drop(disc, e->re.group.expr.binary.left); 193 drop(disc, e->re.group.expr.binary.right); 194 break; 195 case REX_GROUP: 196 case REX_GROUP_AHEAD: 197 case REX_GROUP_AHEAD_NOT: 198 case REX_GROUP_BEHIND: 199 case REX_GROUP_BEHIND_NOT: 200 case REX_GROUP_CUT: 201 case REX_NEG: 202 case REX_REP: 203 drop(disc, e->re.group.expr.rex); 204 break; 205 case REX_TRIE: 206 for (i = 0; i <= UCHAR_MAX; i++) 207 triedrop(disc, e->re.trie.root[i]); 208 break; 209 } 210 x = e->next; 211 alloc(disc, e, 0); 212 } while (e = x); 213 } 214 215 /* 216 * mark e and descendants minimal 217 */ 218 219 static void 220 mark(register Rex_t* e, int set) 221 { 222 if (e && !e->marked) 223 do 224 { 225 e->marked = 1; 226 if (set) 227 e->flags |= REG_MINIMAL; 228 else 229 e->flags &= ~REG_MINIMAL; 230 switch (e->type) 231 { 232 case REX_ALT: 233 case REX_CONJ: 234 case REX_GROUP_COND: 235 if (e->re.group.expr.binary.left) 236 mark(e->re.group.expr.binary.left, set); 237 if (e->re.group.expr.binary.right) 238 mark(e->re.group.expr.binary.right, set); 239 break; 240 case REX_GROUP: 241 case REX_GROUP_AHEAD: 242 case REX_GROUP_AHEAD_NOT: 243 case REX_GROUP_BEHIND: 244 case REX_GROUP_BEHIND_NOT: 245 case REX_GROUP_CUT: 246 case REX_NEG: 247 case REX_REP: 248 case REX_TRIE: 249 mark(e->re.group.expr.rex, set); 250 break; 251 } 252 } while (e = e->next); 253 } 254 255 /* 256 * assign subexpression numbers by a preorder tree walk 257 */ 258 259 static int 260 serialize(Cenv_t* env, Rex_t* e, int n) 261 { 262 do 263 { 264 e->serial = n++; 265 switch (e->type) 266 { 267 case REX_ALT: 268 case REX_GROUP_COND: 269 if (e->re.group.expr.binary.left) 270 n = serialize(env, e->re.group.expr.binary.left, n); 271 e->re.group.expr.binary.serial = n++; 272 if (e->re.group.expr.binary.right) 273 n = serialize(env, e->re.group.expr.binary.right, n); 274 break; 275 case REX_CONJ: 276 n = serialize(env, e->re.group.expr.binary.left, n); 277 n = serialize(env, e->re.group.expr.binary.right, n); 278 break; 279 case REX_GROUP: 280 case REX_GROUP_AHEAD: 281 case REX_GROUP_AHEAD_NOT: 282 case REX_GROUP_BEHIND: 283 case REX_GROUP_BEHIND_NOT: 284 case REX_GROUP_CUT: 285 case REX_NEG: 286 case REX_REP: 287 n = serialize(env, e->re.group.expr.rex, n); 288 break; 289 } 290 } while (e = e->next); 291 return n; 292 } 293 294 /* 295 * catenate e and f into a sequence, collapsing them if possible 296 */ 297 298 static Rex_t* 299 cat(Cenv_t* env, Rex_t* e, Rex_t* f) 300 { 301 Rex_t* g; 302 303 if (!f) 304 { 305 drop(env->disc, e); 306 return 0; 307 } 308 if (e->type == REX_NULL) 309 { 310 drop(env->disc, e); 311 return f; 312 } 313 if (f->type == REX_NULL) 314 { 315 g = f->next; 316 f->next = 0; 317 drop(env->disc, f); 318 f = g; 319 } 320 else if (e->type == REX_DOT && f->type == REX_DOT) 321 { 322 unsigned int m = e->lo + f->lo; 323 unsigned int n = e->hi + f->hi; 324 325 if (m <= RE_DUP_MAX) 326 { 327 if (e->hi > RE_DUP_MAX || f->hi > RE_DUP_MAX) 328 { 329 n = RE_DUP_INF; 330 goto combine; 331 } 332 else if (n <= RE_DUP_MAX) 333 { 334 combine: 335 e->lo = m; 336 e->hi = n; 337 g = f->next; 338 f->next = 0; 339 drop(env->disc, f); 340 f = g; 341 } 342 } 343 } 344 e->next = f; 345 return e; 346 } 347 348 /* 349 * collect re statistics 350 */ 351 352 static int 353 stats(register Cenv_t* env, register Rex_t* e) 354 { 355 register unsigned long n; 356 register unsigned long m; 357 unsigned long cm; 358 unsigned long nm; 359 unsigned long cn; 360 unsigned long nn; 361 unsigned long l; 362 unsigned long k; 363 unsigned long t; 364 Rex_t* q; 365 Rex_t* x; 366 Rex_t* y; 367 unsigned char c; 368 unsigned char b; 369 370 do 371 { 372 switch (e->type) 373 { 374 case REX_ALT: 375 x = env->stats.x; 376 l = env->stats.l; 377 y = env->stats.y; 378 k = env->stats.k; 379 t = env->stats.t; 380 if (++env->stats.a <= 0) 381 return 1; 382 cm = env->stats.m; 383 env->stats.m = 0; 384 cn = env->stats.n; 385 env->stats.n = 0; 386 if (stats(env, e->re.group.expr.binary.left)) 387 return 1; 388 m = env->stats.m; 389 env->stats.m = 0; 390 n = env->stats.n; 391 env->stats.n = 0; 392 if (e->re.group.expr.binary.right && stats(env, e->re.group.expr.binary.right)) 393 return 1; 394 if (env->stats.m > m) 395 env->stats.m = m; 396 else 397 m = env->stats.m; 398 if ((env->stats.m += cm) < m) 399 return 1; 400 if (env->stats.n < n) 401 env->stats.n = n; 402 else 403 n = env->stats.n; 404 if ((env->stats.n += cn) < n) 405 return 1; 406 env->stats.x = x; 407 env->stats.l = l; 408 env->stats.y = y; 409 env->stats.k = k; 410 env->stats.t = t; 411 break; 412 case REX_BACK: 413 if (++env->stats.b <= 0) 414 return 1; 415 break; 416 case REX_CLASS: 417 case REX_COLL_CLASS: 418 case REX_DOT: 419 case REX_ONECHAR: 420 n = env->stats.m; 421 if ((env->stats.m += e->lo) < n) 422 return 1; 423 if (e->hi != RE_DUP_INF) 424 { 425 n = env->stats.n; 426 if ((env->stats.n += e->hi) < n) 427 return 1; 428 } 429 if (e->lo != e->hi) 430 { 431 if (++env->stats.c <= 0) 432 return 1; 433 if (++env->stats.s <= 0) 434 return 1; 435 } 436 break; 437 case REX_CONJ: 438 cm = env->stats.m; 439 env->stats.m = 0; 440 cn = env->stats.n; 441 env->stats.n = 0; 442 if (stats(env, e->re.group.expr.binary.left)) 443 return 1; 444 nm = env->stats.m; 445 env->stats.m = 0; 446 nn = env->stats.n; 447 env->stats.n = 0; 448 if (stats(env, e->re.group.expr.binary.right)) 449 return 1; 450 if (env->stats.m < nm) 451 env->stats.m = nm; 452 else 453 nm = env->stats.m; 454 if ((env->stats.m += cm) < nm) 455 return 1; 456 if (env->stats.n < nn) 457 env->stats.n = nn; 458 else 459 nn = env->stats.n; 460 if ((env->stats.n += cn) < nn) 461 return 1; 462 break; 463 case REX_GROUP: 464 if (e->re.group.number && ++env->stats.p <= 0 || !e->re.group.number && ++env->stats.u <= 0) 465 return 1; 466 if (stats(env, e->re.group.expr.rex)) 467 return 1; 468 break; 469 case REX_GROUP_AHEAD: 470 case REX_GROUP_AHEAD_NOT: 471 case REX_GROUP_BEHIND: 472 case REX_GROUP_BEHIND_NOT: 473 m = env->stats.m; 474 n = env->stats.n; 475 x = env->stats.x; 476 y = env->stats.y; 477 if (stats(env, e->re.group.expr.rex)) 478 return 1; 479 env->stats.m = m; 480 env->stats.n = n; 481 env->stats.x = x; 482 env->stats.y = y; 483 switch (e->type) 484 { 485 case REX_GROUP_AHEAD: 486 case REX_GROUP_BEHIND: 487 if (++env->stats.u <= 0) 488 return 1; 489 break; 490 } 491 break; 492 case REX_GROUP_COND: 493 if (++env->stats.u <= 0) 494 return 1; 495 m = env->stats.m; 496 n = env->stats.n; 497 x = env->stats.x; 498 y = env->stats.y; 499 if (e->re.group.size > 0 && ++env->stats.b <= 0) 500 return 1; 501 if (e->re.group.expr.binary.left && stats(env, e->re.group.expr.binary.left)) 502 return 1; 503 if (q = e->re.group.expr.binary.right) 504 { 505 if (q->re.group.expr.binary.left && stats(env, q->re.group.expr.binary.left)) 506 return 1; 507 if (q->re.group.expr.binary.right && stats(env, q->re.group.expr.binary.right)) 508 return 1; 509 } 510 env->stats.m = m; 511 env->stats.n = n; 512 env->stats.x = x; 513 env->stats.y = y; 514 break; 515 case REX_GROUP_CUT: 516 if (++env->stats.u <= 0) 517 return 1; 518 m = env->stats.m; 519 n = env->stats.n; 520 x = env->stats.x; 521 y = env->stats.y; 522 if (stats(env, e->re.group.expr.rex)) 523 return 1; 524 env->stats.m = m; 525 env->stats.n = n; 526 env->stats.x = x; 527 env->stats.y = y; 528 break; 529 case REX_NEG: 530 env->stats.i++; 531 x = env->stats.x; 532 l = env->stats.l; 533 y = env->stats.y; 534 k = env->stats.k; 535 t = env->stats.t; 536 cm = env->stats.m; 537 env->stats.m = 0; 538 if (stats(env, e->re.group.expr.rex)) 539 return 1; 540 env->stats.m = !env->stats.m; 541 if ((env->stats.m += cm) < cm) 542 return 1; 543 env->stats.x = x; 544 env->stats.l = l; 545 env->stats.y = y; 546 env->stats.k = k; 547 env->stats.t = t; 548 break; 549 case REX_REP: 550 x = env->stats.x; 551 l = env->stats.l; 552 y = env->stats.y; 553 k = env->stats.k; 554 t = env->stats.t; 555 if (++env->stats.c <= 0) 556 return 1; 557 b = env->stats.b; 558 c = env->stats.c; 559 cm = env->stats.m; 560 env->stats.m = 0; 561 if (stats(env, e->re.group.expr.rex)) 562 return 1; 563 if (env->stats.m == 1 && b == env->stats.b && c == env->stats.c && ++env->stats.s <= 0) 564 return 1; 565 if (e->lo < 1) 566 { 567 env->stats.x = x; 568 env->stats.l = l; 569 env->stats.y = y; 570 env->stats.k = k; 571 env->stats.t = t; 572 env->stats.m = cm; 573 } 574 else 575 { 576 m = env->stats.m; 577 if ((env->stats.m *= e->lo) > 0 && env->stats.m < m) 578 return 1; 579 m = env->stats.m; 580 if ((env->stats.m += cm) < m) 581 return 1; 582 if (env->stats.x != x) 583 env->stats.l = cm; 584 if (env->stats.y != y) 585 env->stats.k = cm; 586 } 587 break; 588 case REX_STRING: 589 cm = env->stats.m; 590 if ((env->stats.m += e->re.string.size) < cm) 591 return 1; 592 cn = env->stats.n; 593 if ((env->stats.n += e->re.string.size) < cn) 594 return 1; 595 if (!env->stats.x || env->stats.x->re.string.size < e->re.string.size) 596 { 597 env->stats.x = e; 598 env->stats.l = cm; 599 } 600 break; 601 case REX_TRIE: 602 if (++env->stats.s <= 0) 603 return 1; 604 cm = env->stats.m; 605 if ((env->stats.m += e->re.trie.min) < cm) 606 return 1; 607 cn = env->stats.n; 608 if ((env->stats.n += e->re.trie.max) < cn) 609 return 1; 610 env->stats.t++; 611 if (!env->stats.y || env->stats.y->re.trie.min < e->re.trie.min) 612 { 613 env->stats.y = e; 614 env->stats.k = cm; 615 } 616 break; 617 } 618 } while (e = e->next); 619 return 0; 620 } 621 622 static int token(Cenv_t*); 623 624 static int 625 magic(register Cenv_t* env, register int c, int escaped) 626 { 627 register char* sp; 628 register int n; 629 int o = c; 630 int e = env->error; 631 int l = env->token.len; 632 short* mp; 633 char* ep; 634 635 if (mp = state.magic[c]) 636 { 637 c = mp[env->type+escaped]; 638 if (c >= T_META) 639 { 640 sp = (char*)env->cursor + env->token.len; 641 switch (c) 642 { 643 case T_LEFT: 644 n = 0; 645 ep = sp; 646 while (*sp >= '0' && *sp <= '9') 647 { 648 if (n > (INT_MAX / 10)) 649 { 650 env->error = REG_BADBR; 651 goto bad; 652 } 653 n = n * 10 + *sp++ - '0'; 654 } 655 if (sp == ep) 656 { 657 env->error = *sp ? REG_BADBR : REG_EBRACE; 658 goto bad; 659 } 660 else if (n > RE_DUP_MAX) 661 { 662 env->error = REG_BADBR; 663 goto bad; 664 } 665 env->token.min = n; 666 if (*sp == ',') 667 { 668 n = 0; 669 ep = ++sp; 670 while (*sp >= '0' && *sp <= '9') 671 { 672 if (n > (INT_MAX / 10)) 673 { 674 env->error = REG_BADBR; 675 goto bad; 676 } 677 n = n * 10 + *sp++ - '0'; 678 } 679 if (sp == ep) 680 n = RE_DUP_INF; 681 else if (n < env->token.min) 682 { 683 env->error = REG_BADBR; 684 goto bad; 685 } 686 } 687 env->token.max = n; 688 switch (*sp) 689 { 690 case 0: 691 env->error = REG_EBRACE; 692 goto bad; 693 case '\\': 694 if (!escaped) 695 { 696 env->error = REG_BADBR; 697 goto bad; 698 } 699 sp++; 700 break; 701 default: 702 if (escaped) 703 { 704 env->error = REG_BADBR; 705 goto bad; 706 } 707 break; 708 } 709 switch (*sp++) 710 { 711 case 0: 712 env->error = REG_EBRACE; 713 goto bad; 714 case '}': 715 break; 716 default: 717 env->error = REG_BADBR; 718 goto bad; 719 } 720 env->token.len = sp - (char*)env->cursor; 721 break; 722 case T_RIGHT: 723 env->error = REG_EBRACE; 724 goto bad; 725 case T_OPEN: 726 if (env->type < SRE && *sp == '?') 727 { 728 env->token.len++; 729 env->token.lex = 0; 730 goto group; 731 } 732 break; 733 case T_ESCAPE: 734 c = chresc(sp - 2, &ep); 735 if (ep < sp) 736 goto bad; 737 env->token.len += ep - sp; 738 if (c >= T_META) 739 { 740 env->token.lex = c; 741 c = C_ESC; 742 } 743 return c; 744 case T_BACK+0: 745 case T_BACK+1: 746 case T_BACK+2: 747 case T_BACK+3: 748 case T_BACK+4: 749 case T_BACK+5: 750 case T_BACK+6: 751 case T_BACK+7: 752 n = chresc(sp - 2, &ep); 753 if (ep > sp + 1) 754 { 755 env->token.len += ep - sp; 756 return n; 757 } 758 /*FALLTHROUGH*/ 759 case T_BACK+8: 760 case T_BACK+9: 761 if (env->type == SRE || c == T_BACK && !(env->flags & REG_LENIENT)) 762 { 763 env->error = REG_BADESC; 764 goto bad; 765 } 766 if ((env->flags & REG_MULTIREF) && isdigit(*sp)) 767 { 768 c = (c - T_BACK) * 10 + (*sp - '0'); 769 if (c > 0 && c <= env->parno && env->paren[c]) 770 c += T_BACK; 771 else 772 c = chresc(sp - 2, &ep); 773 env->token.len++; 774 } 775 if (c == T_BACK) 776 c = 0; 777 break; 778 case T_BAD: 779 if (escaped == 1 && (env->flags & REG_LENIENT) && (c = mp[env->type+escaped+2]) >= T_META) 780 return c; 781 goto bad; 782 } 783 if (env->type >= SRE) 784 { 785 if (c == T_DOT) 786 c = '.'; 787 else if (c < T_OPEN) 788 { 789 if (env->type == KRE && *(env->cursor + env->token.len) == '-' && *(env->cursor + env->token.len + 1) == '(') 790 { 791 env->token.len++; 792 env->token.att = 1; 793 } 794 if (env->type == KRE && *(env->cursor + env->token.len) == '(') 795 { 796 env->token.len++; 797 switch (c) 798 { 799 case T_AT: 800 break; 801 case T_PERCENT: 802 env->token.lex = c; 803 goto group; 804 case T_TILDE: 805 env->token.lex = 0; 806 goto group; 807 default: 808 env->token.lex = c; 809 break; 810 } 811 c = T_OPEN; 812 } 813 else if (c == T_STAR) 814 c = T_DOTSTAR; 815 else if (c == T_QUES) 816 c = T_DOT; 817 else 818 { 819 c = o; 820 env->token.len = l; 821 } 822 } 823 else if (c > T_BACK) 824 { 825 c = (c - T_BACK) * 2 - 1; 826 c = (c > env->parno || !env->paren[c]) ? o : T_BACK + c; 827 } 828 else if (env->type == KRE && !env->parnest && (env->flags & REG_SHELL_GROUP)) 829 { 830 if (c == T_AND) 831 c = '&'; 832 else if (c == T_BAR) 833 c = '|'; 834 else if (c == T_OPEN) 835 c = '('; 836 } 837 } 838 } 839 } 840 else if (escaped == 2) 841 { 842 if (env->type >= SRE && !(env->flags & REG_SHELL_ESCAPED) || (env->flags & REG_ESCAPE) && (c == '[' || c == '-' || c == ']' || env->delimiter && c == env->delimiter)) 843 /*ok*/; 844 else 845 { 846 env->error = REG_BADESC; 847 goto bad; 848 } 849 } 850 else if (escaped && !(env->flags & REG_LENIENT) && c != ']') 851 { 852 env->error = REG_BADESC; 853 goto bad; 854 } 855 return c; 856 group: 857 sp = (char*)env->cursor + env->token.len; 858 switch (*sp++) 859 { 860 case ')': 861 break; 862 case '#': 863 for (;;) 864 { 865 switch (*sp++) 866 { 867 case 0: 868 env->error = REG_EPAREN; 869 return T_BAD; 870 case ')': 871 break; 872 default: 873 continue; 874 } 875 break; 876 } 877 break; 878 default: 879 return T_GROUP; 880 } 881 env->cursor = (unsigned char*)sp; 882 return token(env); 883 bad: 884 if (escaped == 2) 885 env->error = e; 886 else if (env->flags & REG_LENIENT) 887 return o; 888 else if (escaped == 1 && !env->error) 889 { 890 if (mp || o == ']') 891 return o; 892 env->error = REG_BADESC; 893 } 894 return T_BAD; 895 } 896 897 static int 898 token(register Cenv_t* env) 899 { 900 int c; 901 int posixkludge; 902 903 if (env->token.push) 904 return env->token.lex; 905 env->token.att = env->token.esc = 0; 906 if ((env->token.len = MBSIZE(env->cursor)) > 1) 907 return env->token.lex = C_MB; 908 env->token.lex = 0; 909 for (;;) 910 { 911 c = *env->cursor; 912 if (c == 0 || c == env->delimiter || c == env->terminator) 913 return T_END; 914 if (!(env->flags & REG_COMMENT)) 915 break; 916 if (c == '#') 917 { 918 do 919 { 920 c = *++env->cursor; 921 if (c == 0 || c == env->delimiter) 922 return T_END; 923 } while (c != '\n'); 924 } 925 else if (!isspace(c)) 926 break; 927 env->cursor++; 928 } 929 if (c == '\n' && (env->flags & REG_MULTIPLE) && !env->delimiter) 930 { 931 if (env->parnest) 932 { 933 env->error = REG_EPAREN; 934 return T_BAD; 935 } 936 env->parno = 0; 937 env->pattern = env->cursor + 1; 938 return T_BAR; 939 } 940 if (env->flags & REG_LITERAL) 941 return c; 942 if (posixkludge = env->posixkludge) 943 { 944 env->posixkludge = 0; 945 if (c == '*') 946 return c; 947 } 948 if (c == '\\') 949 { 950 if (env->flags & REG_SHELL_ESCAPED) 951 return c; 952 if (!(c = *(env->cursor + 1)) || c == env->terminator) 953 { 954 if (env->flags & REG_LENIENT) 955 { 956 if (c) 957 { 958 env->token.esc = env->token.len; 959 env->token.len += MBSIZE(env->cursor + 1); 960 return c; 961 } 962 return '\\'; 963 } 964 env->error = REG_EESCAPE; 965 return T_BAD; 966 } 967 env->token.esc = env->token.len; 968 env->token.len += MBSIZE(env->cursor + 1); 969 if (env->delimiter && c == 'n') 970 return '\n'; 971 else if (c == env->delimiter) 972 return magic(env, c, 0); 973 else if (c == '(' && env->type == BRE) 974 env->posixkludge = 1; 975 else if (c == ')' && env->type == BRE && env->parnest <= 0) 976 { 977 env->error = REG_EPAREN; 978 return T_BAD; 979 } 980 else if (isspace(c) && (env->flags & REG_COMMENT)) 981 return c; 982 return magic(env, c, 1); 983 } 984 else if (c == '$') 985 { 986 if (env->type == BRE && (*(env->cursor + 1) == 0 || *(env->cursor + 1) == env->delimiter || *(env->cursor + 1) == env->terminator || *(env->cursor + 1) == '\\' && *(env->cursor + 2) == ')') || (env->flags & REG_MULTIPLE) && *(env->cursor + 1) == '\n') 987 return T_DOLL; 988 } 989 else if (c == '^') 990 { 991 if (env->type == BRE && (env->cursor == env->pattern || posixkludge)) 992 { 993 env->posixkludge = 1; 994 return T_CFLX; 995 } 996 } 997 else if (c == ')') 998 { 999 if (env->type != BRE && env->parnest <= 0) 1000 return c; 1001 } 1002 else if (c == '/' && env->explicit == env->mappedslash) 1003 { 1004 while (*(env->cursor + env->token.len) == c) 1005 env->token.len++; 1006 return T_SLASHPLUS; 1007 } 1008 return magic(env, c, 0); 1009 } 1010 1011 static Celt_t* 1012 col(Celt_t* ce, int ic, unsigned char* bp, int bw, int bc, unsigned char* ep, int ew, int ec) 1013 { 1014 register unsigned char* s; 1015 register unsigned char* k; 1016 register unsigned char* e; 1017 register int c; 1018 register int cc; 1019 int bt; 1020 int et; 1021 Ckey_t key; 1022 1023 cc = 0; 1024 for (;;) 1025 { 1026 k = key; 1027 if (bw == 1) 1028 { 1029 c = bc; 1030 if (ic) 1031 { 1032 if (iswupper(c)) 1033 { 1034 c = towlower(c); 1035 cc = 1; 1036 } 1037 else if (iswlower(c)) 1038 { 1039 c = towupper(c); 1040 cc = 1; 1041 } 1042 } 1043 *k++ = c; 1044 } 1045 else if (bw < COLL_KEY_MAX) 1046 { 1047 s = bp; 1048 e = s + bw; 1049 while (s < e) 1050 { 1051 c = *s++; 1052 if (ic) 1053 { 1054 if (isupper(c)) 1055 { 1056 c = tolower(c); 1057 cc = 1; 1058 } 1059 else if (islower(c)) 1060 { 1061 c = toupper(c); 1062 cc = 1; 1063 } 1064 } 1065 *k++ = c; 1066 } 1067 } 1068 *k = 0; 1069 mbxfrm(ce->beg, key, COLL_KEY_MAX); 1070 if (ep) 1071 { 1072 k = key; 1073 c = mbchar(k); 1074 if (iswupper(c)) 1075 bt = COLL_range_uc; 1076 else if (iswlower(c)) 1077 bt = COLL_range_lc; 1078 else 1079 bt = COLL_range; 1080 k = key; 1081 if (ew == 1) 1082 { 1083 c = ec; 1084 if (ic) 1085 { 1086 if (iswupper(c)) 1087 { 1088 c = towlower(c); 1089 cc = 1; 1090 } 1091 else if (iswlower(c)) 1092 { 1093 c = towupper(c); 1094 cc = 1; 1095 } 1096 } 1097 *k++ = c; 1098 } 1099 else if (ew < COLL_KEY_MAX) 1100 { 1101 s = ep; 1102 e = s + ew; 1103 while (s < e) 1104 { 1105 c = *s++; 1106 if (ic) 1107 { 1108 if (isupper(c)) 1109 { 1110 c = tolower(c); 1111 cc = 1; 1112 } 1113 else if (islower(c)) 1114 { 1115 c = toupper(c); 1116 cc = 1; 1117 } 1118 } 1119 *k++ = c; 1120 } 1121 } 1122 *k = 0; 1123 mbxfrm(ce->end, key, COLL_KEY_MAX); 1124 k = key; 1125 c = mbchar(k); 1126 if (iswupper(c)) 1127 et = COLL_range_uc; 1128 else if (iswlower(c)) 1129 et = COLL_range_lc; 1130 else 1131 et = COLL_range; 1132 ce->typ = bt == et ? bt : COLL_range; 1133 } 1134 else 1135 ce->typ = COLL_char; 1136 ce++; 1137 if (!ic || !cc) 1138 break; 1139 ic = 0; 1140 } 1141 return ce; 1142 } 1143 1144 static Rex_t* 1145 bra(Cenv_t* env) 1146 { 1147 Rex_t* e; 1148 int c; 1149 int i; 1150 int w; 1151 int neg; 1152 int last; 1153 int inrange; 1154 int complicated; 1155 int collate; 1156 int elements; 1157 unsigned char* first; 1158 unsigned char* start; 1159 unsigned char* begin; 1160 unsigned char* s; 1161 regclass_t f; 1162 unsigned char buf[4 * (COLL_KEY_MAX + 1)]; 1163 #if _PACKAGE_ast 1164 int ic; 1165 char mbc[COLL_KEY_MAX + 1]; 1166 #endif 1167 1168 if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) 1169 return 0; 1170 collate = complicated = elements = 0; 1171 first = env->cursor; 1172 start = first + MBSIZE(first); 1173 if (*env->cursor == '^' || env->type >= SRE && *env->cursor == '!') 1174 { 1175 env->cursor++; 1176 neg = 1; 1177 } 1178 else 1179 neg = 0; 1180 if (*env->cursor == 0 || *(env->cursor + 1) == 0 || *env->cursor == env->terminator || *(env->cursor + 1) == env->terminator || (env->flags & REG_ESCAPE) && (*env->cursor == env->delimiter || *env->cursor != '\\' && *(env->cursor + 1) == env->delimiter)) 1181 goto error; 1182 begin = env->cursor + MBSIZE(env->cursor); 1183 1184 /* 1185 * inrange: 0=no, 1=possibly, 2=definitely 1186 */ 1187 1188 inrange = 0; 1189 for (;;) 1190 { 1191 if (!(c = *env->cursor) || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter) 1192 goto error; 1193 env->cursor += (w = MBSIZE(env->cursor)); 1194 if (c == '\\') 1195 { 1196 if (*env->cursor) 1197 { 1198 if (*env->cursor == 'n') 1199 { 1200 env->cursor++; 1201 c = '\n'; 1202 } 1203 else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) 1204 { 1205 env->token.len = 1; 1206 w = magic(env, *env->cursor, 2); 1207 if (env->token.len > 1 || w != T_BAD) 1208 { 1209 if (env->token.len == 1 && (f = classfun(w))) 1210 { 1211 if (inrange > 1) 1212 { 1213 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1214 goto erange; 1215 inrange = 0; 1216 } 1217 env->cursor++; 1218 for (c = 0; c <= UCHAR_MAX; c++) 1219 if ((*f)(c)) 1220 setadd(e->re.charclass, c); 1221 complicated++; 1222 elements++; 1223 continue; 1224 } 1225 if (env->token.len > 1 || w >= 0 && w < T_META) 1226 { 1227 c = w; 1228 if (c > UCHAR_MAX) 1229 { 1230 if (env->type < SRE && !(env->flags & REG_LENIENT) && !mbwide()) 1231 goto erange; 1232 c = UCHAR_MAX; 1233 } 1234 env->cursor += env->token.len; 1235 } 1236 } 1237 } 1238 } 1239 } 1240 else if (c == ']') 1241 { 1242 if (env->cursor == begin) 1243 { 1244 last = c; 1245 inrange = 1; 1246 continue; 1247 } 1248 if (inrange != 0) 1249 { 1250 setadd(e->re.charclass, last); 1251 elements++; 1252 if (inrange == 2) 1253 { 1254 setadd(e->re.charclass, '-'); 1255 elements++; 1256 } 1257 } 1258 break; 1259 } 1260 else if (c == '-') 1261 { 1262 if (!inrange && env->cursor != begin && *env->cursor != ']') 1263 { 1264 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1265 goto erange; 1266 continue; 1267 } 1268 else if (inrange == 1) 1269 { 1270 inrange = 2; 1271 complicated++; 1272 continue; 1273 } 1274 } 1275 else if (c == '[') 1276 { 1277 switch (*env->cursor) 1278 { 1279 case 0: 1280 goto error; 1281 case ':': 1282 if (inrange == 1) 1283 { 1284 setadd(e->re.charclass, last); 1285 elements++; 1286 } 1287 if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) 1288 { 1289 if (env->cursor == start && (c = *(env->cursor + 1))) 1290 { 1291 s = start = env->cursor + 1; 1292 while (*++s && *s != ':'); 1293 if (*s == ':' && *(s + 1) == ']' && *(s + 2) == ']') 1294 { 1295 if ((i = (s - start)) == 1) 1296 { 1297 switch (c) 1298 { 1299 case '<': 1300 i = REX_WBEG; 1301 break; 1302 case '>': 1303 i = REX_WEND; 1304 break; 1305 default: 1306 i = 0; 1307 break; 1308 } 1309 if (i) 1310 { 1311 env->cursor = s + 3; 1312 drop(env->disc, e); 1313 return node(env, i, 0, 0, 0); 1314 } 1315 } 1316 } 1317 } 1318 env->error = REG_ECTYPE; 1319 goto error; 1320 } 1321 for (c = 0; c <= UCHAR_MAX; c++) 1322 if ((*f)(c)) 1323 setadd(e->re.charclass, c); 1324 inrange = 0; 1325 complicated++; 1326 elements++; 1327 continue; 1328 case '=': 1329 if (inrange == 2) 1330 goto erange; 1331 if (inrange == 1) 1332 { 1333 setadd(e->re.charclass, last); 1334 elements++; 1335 } 1336 if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0) 1337 goto ecollate; 1338 if (c > 1) 1339 collate++; 1340 else 1341 setadd(e->re.charclass, buf[0]); 1342 c = buf[0]; 1343 inrange = 0; 1344 complicated++; 1345 elements++; 1346 continue; 1347 case '.': 1348 if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0) 1349 goto ecollate; 1350 if (c > 1) 1351 collate++; 1352 c = buf[0]; 1353 complicated++; 1354 break; 1355 default: 1356 if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) 1357 goto error; 1358 break; 1359 } 1360 } 1361 else if (w > 1) 1362 complicated++; 1363 if (inrange == 2) 1364 { 1365 if (last > c) 1366 { 1367 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1368 goto erange; 1369 setadd(e->re.charclass, last); 1370 setadd(e->re.charclass, c); 1371 } 1372 else 1373 for (i = last; i <= c; i++) 1374 setadd(e->re.charclass, i); 1375 inrange = env->type >= SRE || (env->flags & REG_LENIENT); 1376 elements += 2; 1377 } 1378 else if (inrange == 1) 1379 { 1380 setadd(e->re.charclass, last); 1381 elements++; 1382 } 1383 else 1384 inrange = 1; 1385 last = c; 1386 } 1387 #if _PACKAGE_ast 1388 if (complicated && mbcoll()) 1389 { 1390 Dt_t* dt; 1391 Cchr_t* cc; 1392 Cchr_t* tc; 1393 Cchr_t* xc; 1394 Celt_t* ce; 1395 Cchr_t key; 1396 int cw; 1397 int rw; 1398 int rc; 1399 unsigned char* rp; 1400 unsigned char* pp; 1401 1402 static Dtdisc_t disc; 1403 1404 static const char primary[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; 1405 1406 if (!(dt = (Dt_t*)LCINFO(AST_LC_COLLATE)->data)) 1407 { 1408 disc.key = offsetof(Cchr_t, key); 1409 if ((cc = newof(0, Cchr_t, elementsof(primary), 0)) && (dt = dtopen(&disc, Dttree))) 1410 { 1411 for (i = 0; i < elementsof(primary) - 1; i++, cc++) 1412 { 1413 cc->nam[0] = primary[i]; 1414 mbxfrm(cc->key, cc->nam, COLL_KEY_MAX); 1415 dtinsert(dt, cc); 1416 } 1417 for (i = 0; i < elementsof(cc->key); i++) 1418 cc->key[i] = ~0; 1419 dtinsert(dt, cc); 1420 LCINFO(AST_LC_COLLATE)->data = (void*)dt; 1421 } 1422 else 1423 { 1424 if (cc) 1425 free(cc); 1426 drop(env->disc, e); 1427 return 0; 1428 } 1429 } 1430 if (dt) 1431 { 1432 drop(env->disc, e); 1433 if (ic = env->flags & REG_ICASE) 1434 elements *= 2; 1435 if (!(e = node(env, REX_COLL_CLASS, 1, 1, (elements + 1) * sizeof(Celt_t)))) 1436 return 0; 1437 ce = (Celt_t*)e->re.data; 1438 e->re.collate.invert = neg; 1439 e->re.collate.elements = ce; 1440 env->cursor = first; 1441 inrange = 0; 1442 for (;;) 1443 { 1444 if ((c = *env->cursor) == 0 || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter) 1445 goto error; 1446 pp = env->cursor; 1447 env->cursor += (w = MBSIZE(env->cursor)); 1448 if (c == '\\') 1449 { 1450 if (*env->cursor) 1451 { 1452 if (*env->cursor == 'n') 1453 { 1454 pp = env->cursor++; 1455 c = '\n'; 1456 } 1457 else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) 1458 { 1459 env->token.len = 1; 1460 w = magic(env, *env->cursor, 2); 1461 if (env->token.len > 1 || w != T_BAD) 1462 { 1463 if (env->token.len == 1 && (f = classfun(w))) 1464 { 1465 if (inrange > 1) 1466 { 1467 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1468 goto erange; 1469 inrange = 0; 1470 } 1471 env->cursor++; 1472 ce->fun = f; 1473 ce->typ = COLL_call; 1474 ce++; 1475 continue; 1476 } 1477 if (env->token.len > 1 || w >= 0 && w < T_META) 1478 { 1479 c = w; 1480 w = wctomb(mbc, c); 1481 pp = (unsigned char*)mbc; 1482 env->cursor += env->token.len; 1483 } 1484 } 1485 } 1486 } 1487 } 1488 else if (c == ']') 1489 { 1490 if (env->cursor == begin) 1491 { 1492 rp = pp; 1493 rw = w; 1494 inrange = 1; 1495 continue; 1496 } 1497 if (inrange != 0) 1498 { 1499 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1500 if (inrange == 2) 1501 ce = col(ce, ic, NiL, 1, '-', NiL, 0, 0); 1502 } 1503 break; 1504 } 1505 else if (c == '-') 1506 { 1507 if (!inrange && env->cursor != begin && *env->cursor != ']') 1508 { 1509 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1510 goto erange; 1511 continue; 1512 } 1513 else if (inrange == 1) 1514 { 1515 inrange = 2; 1516 continue; 1517 } 1518 } 1519 else if (c == '[') 1520 { 1521 switch (*env->cursor) 1522 { 1523 case 0: 1524 goto error; 1525 case ':': 1526 if (inrange == 1) 1527 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1528 if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) 1529 { 1530 if (env->cursor == start && (c = *(env->cursor + 1)) && *(env->cursor + 2) == ':' && *(env->cursor + 3) == ']' && *(env->cursor + 4) == ']') 1531 { 1532 switch (c) 1533 { 1534 case '<': 1535 i = REX_WBEG; 1536 break; 1537 case '>': 1538 i = REX_WEND; 1539 break; 1540 default: 1541 i = 0; 1542 break; 1543 } 1544 if (i) 1545 { 1546 env->cursor += 5; 1547 drop(env->disc, e); 1548 return node(env, i, 0, 0, 0); 1549 } 1550 } 1551 env->error = REG_ECTYPE; 1552 goto error; 1553 } 1554 ce->fun = f; 1555 ce->typ = COLL_call; 1556 ce++; 1557 inrange = 0; 1558 continue; 1559 case '=': 1560 if (inrange == 2) 1561 goto erange; 1562 if (inrange == 1) 1563 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1564 rp = env->cursor + 1; 1565 if ((rw = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf) - 1)) < 0) 1566 goto ecollate; 1567 c = 0; 1568 if (ic) 1569 for (i = 0; i < rw; i += MBSIZE(buf+i)) 1570 if (isupper(buf[i])) 1571 { 1572 buf[i] = tolower(buf[i]); 1573 c = 1; 1574 } 1575 else if (islower(buf[i])) 1576 c = 1; 1577 for (;;) 1578 { 1579 mbxfrm(key.key, buf, COLL_KEY_MAX); 1580 if (!(cc = (Cchr_t*)dtsearch(dt, &key))) 1581 { 1582 if (!isalnum(buf[0])) 1583 { 1584 strcpy((char*)ce->beg, (char*)key.key); 1585 ce->typ = COLL_char; 1586 ce++; 1587 break; 1588 } 1589 if (!(cc = (Cchr_t*)dtprev(dt, &key))) 1590 goto ecollate; 1591 } 1592 xc = cc; 1593 if (islower(buf[0])) 1594 { 1595 ce->typ = COLL_range_lc; 1596 if ((tc = (Cchr_t*)dtprev(dt, cc)) && !strcasecmp((char*)tc->nam, (char*)cc->nam)) 1597 xc = tc; 1598 } 1599 else if (isupper(buf[0])) 1600 { 1601 ce->typ = COLL_range_uc; 1602 if ((tc = (Cchr_t*)dtprev(dt, cc)) && !strcasecmp((char*)tc->nam, (char*)cc->nam)) 1603 xc = tc; 1604 } 1605 else 1606 { 1607 ce->typ = COLL_range; 1608 xc = cc; 1609 } 1610 strcpy((char*)ce->beg, (char*)xc->key); 1611 if (!(cc = (Cchr_t*)dtnext(dt, cc))) 1612 goto ecollate; 1613 if (!strcasecmp((char*)xc->nam, (char*)cc->nam) && (tc = (Cchr_t*)dtnext(dt, cc))) 1614 cc = tc; 1615 strcpy((char*)ce->end, (char*)cc->key); 1616 ce->max = -1; 1617 ce++; 1618 if (!c) 1619 break; 1620 c = 0; 1621 for (i = 0; i < rw; i++) 1622 if (islower(buf[i])) 1623 buf[i] = toupper(buf[i]); 1624 } 1625 inrange = 0; 1626 c = buf[0]; 1627 continue; 1628 case '.': 1629 pp = env->cursor + 1; 1630 if ((w = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf) - 1)) < 0) 1631 goto ecollate; 1632 if (w > 1) 1633 { 1634 if (w > COLL_KEY_MAX) 1635 goto ecollate; 1636 if (ic) 1637 for (i = 0; i < w; i += MBSIZE(buf+i)) 1638 if (isupper(buf[i])) 1639 buf[i] = tolower(buf[i]); 1640 cw = mbxfrm(ce->beg, buf, COLL_KEY_MAX); 1641 buf[1] = 0; 1642 if (mbxfrm(ce->beg, buf, COLL_KEY_MAX) < cw) 1643 goto ecollate; 1644 } 1645 c = buf[0]; 1646 break; 1647 default: 1648 if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) 1649 goto error; 1650 break; 1651 } 1652 } 1653 if (inrange == 2) 1654 { 1655 ce = col(ce, ic, rp, rw, rc, pp, w, c); 1656 if (strcmp((char*)ce->beg, (char*)ce->end) > 0) 1657 { 1658 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1659 goto erange; 1660 (ce-1)->typ = COLL_char; 1661 strcpy((char*)ce->beg, (char*)(ce-1)->end); 1662 ce->typ = COLL_char; 1663 ce++; 1664 } 1665 inrange = env->type >= SRE || (env->flags & REG_LENIENT); 1666 } 1667 else if (inrange == 1) 1668 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1669 else 1670 inrange = 1; 1671 rp = pp; 1672 rw = w; 1673 rc = c; 1674 } 1675 ce->typ = COLL_end; 1676 return e; 1677 } 1678 } 1679 #endif 1680 if (collate) 1681 goto ecollate; 1682 if (env->flags & REG_ICASE) 1683 for (i = 0; i <= UCHAR_MAX; i++) 1684 if (settst(e->re.charclass, i)) 1685 { 1686 if (isupper(i)) 1687 c = tolower(i); 1688 else if (islower(i)) 1689 c = toupper(i); 1690 else 1691 continue; 1692 setadd(e->re.charclass, c); 1693 } 1694 if (neg) 1695 { 1696 for (i = 0; i < elementsof(e->re.charclass->bits); i++) 1697 e->re.charclass->bits[i] ^= ~0; 1698 if (env->explicit >= 0) 1699 setclr(e->re.charclass, env->explicit); 1700 } 1701 return e; 1702 ecollate: 1703 env->error = REG_ECOLLATE; 1704 goto error; 1705 erange: 1706 env->error = REG_ERANGE; 1707 error: 1708 drop(env->disc, e); 1709 if (!env->error) 1710 env->error = REG_EBRACK; 1711 return 0; 1712 } 1713 1714 static Rex_t* 1715 ccl(Cenv_t* env, int type) 1716 { 1717 int i; 1718 Rex_t* e; 1719 Celt_t* ce; 1720 regclass_t f; 1721 1722 if (!(f = classfun(type))) 1723 { 1724 env->error = REG_BADESC; 1725 return 0; 1726 } 1727 if (!mbcoll()) 1728 { 1729 if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) 1730 return 0; 1731 for (i = 0; i <= UCHAR_MAX; i++) 1732 if ((*f)(i)) 1733 setadd(e->re.charclass, i); 1734 if (env->explicit >= 0) 1735 setclr(e->re.charclass, env->explicit); 1736 } 1737 else 1738 { 1739 if (!(e = node(env, REX_COLL_CLASS, 1, 1, 2 * sizeof(Celt_t)))) 1740 return 0; 1741 ce = (Celt_t*)e->re.data; 1742 e->re.collate.invert = 0; 1743 e->re.collate.elements = ce; 1744 ce->typ = COLL_call; 1745 ce->fun = f; 1746 ce++; 1747 ce->typ = COLL_end; 1748 } 1749 return e; 1750 } 1751 1752 static Rex_t* 1753 rep(Cenv_t* env, Rex_t* e, int number, int last) 1754 { 1755 Rex_t* f; 1756 unsigned long m = 0; 1757 unsigned long n = RE_DUP_INF; 1758 int minimal = -1; 1759 1760 if (!e) 1761 return 0; 1762 switch (token(env)) 1763 { 1764 case T_BANG: 1765 eat(env); 1766 if (!(f = node(env, REX_NEG, m, n, 0))) 1767 { 1768 drop(env->disc, e); 1769 return 0; 1770 } 1771 f->re.group.expr.rex = e; 1772 return f; 1773 case T_QUES: 1774 eat(env); 1775 n = 1; 1776 break; 1777 case T_STAR: 1778 eat(env); 1779 break; 1780 case T_PLUS: 1781 eat(env); 1782 m = 1; 1783 break; 1784 case T_LEFT: 1785 eat(env); 1786 m = env->token.min; 1787 n = env->token.max; 1788 break; 1789 default: 1790 return e; 1791 } 1792 if (env->token.att) 1793 minimal = 1; 1794 else if (env->type < SRE) 1795 switch (token(env)) 1796 { 1797 case T_QUES: 1798 eat(env); 1799 minimal = !(env->flags & REG_MINIMAL); 1800 break; 1801 case T_STAR: /*AST*/ 1802 eat(env); 1803 minimal = !!(env->flags & REG_MINIMAL); 1804 break; 1805 } 1806 switch (e->type) 1807 { 1808 case REX_DOT: 1809 case REX_CLASS: 1810 case REX_COLL_CLASS: 1811 case REX_ONECHAR: 1812 e->lo = m; 1813 e->hi = n; 1814 if (minimal >= 0) 1815 mark(e, minimal); 1816 return e; 1817 #if HUH_2002_08_07 1818 case REX_BEG: 1819 #endif 1820 case REX_BEG_STR: 1821 case REX_END_STR: 1822 case REX_FIN_STR: 1823 case REX_WBEG: 1824 case REX_WEND: 1825 case REX_WORD: 1826 case REX_WORD_NOT: 1827 env->error = REG_BADRPT; 1828 drop(env->disc, e); 1829 return 0; 1830 } 1831 if (m == 1 && n == 1) 1832 { 1833 if (minimal >= 0) 1834 mark(e, minimal); 1835 return e; 1836 } 1837 if (!(f = node(env, REX_REP, m, n, 0))) 1838 { 1839 drop(env->disc, e); 1840 return 0; 1841 } 1842 f->re.group.expr.rex = e; 1843 f->re.group.number = number; 1844 f->re.group.last = last; 1845 if (minimal >= 0) 1846 mark(f, minimal); 1847 if (m <= n && n) 1848 { 1849 for (; e && e->type >= REX_GROUP && e->type <= REX_GROUP_CUT; e = e->re.group.expr.rex); 1850 if (e && e->type == REX_NEG) 1851 f->type = REX_GROUP; 1852 } 1853 return f; 1854 } 1855 1856 static int 1857 isstring(Rex_t* e) 1858 { 1859 switch (e->type) 1860 { 1861 case REX_ONECHAR: 1862 return e->lo == 1 && e->hi == 1; 1863 case REX_STRING: 1864 return 1; 1865 } 1866 return 0; 1867 } 1868 1869 static Trie_node_t* 1870 trienode(Cenv_t* env, int c) 1871 { 1872 Trie_node_t* t; 1873 1874 if (t = (Trie_node_t*)alloc(env->disc, 0, sizeof(Trie_node_t))) 1875 { 1876 memset(t, 0, sizeof(Trie_node_t)); 1877 t->c = c; 1878 } 1879 return t; 1880 } 1881 1882 static int 1883 insert(Cenv_t* env, Rex_t* f, Rex_t* g) 1884 { 1885 unsigned char* s; 1886 unsigned char* e; 1887 Trie_node_t* t; 1888 int len; 1889 unsigned char tmp[2]; 1890 1891 switch (f->type) 1892 { 1893 case REX_ONECHAR: 1894 *(s = tmp) = f->re.onechar; 1895 e = s + 1; 1896 break; 1897 case REX_STRING: 1898 s = f->re.string.base; 1899 e = s + f->re.string.size; 1900 break; 1901 default: 1902 return 1; 1903 } 1904 if (!(t = g->re.trie.root[*s]) && !(t = g->re.trie.root[*s] = trienode(env, *s))) 1905 return 1; 1906 for (len = 1;;) 1907 { 1908 if (t->c == *s) 1909 { 1910 if (++s >= e) 1911 break; 1912 if (!t->son && !(t->son = trienode(env, *s))) 1913 return 1; 1914 t = t->son; 1915 len++; 1916 } 1917 else 1918 { 1919 if (!t->sib && !(t->sib = trienode(env, *s))) 1920 return 1; 1921 t = t->sib; 1922 } 1923 } 1924 if (g->re.trie.min > len) 1925 g->re.trie.min = len; 1926 if (g->re.trie.max < len) 1927 g->re.trie.max = len; 1928 t->end = 1; 1929 return 0; 1930 } 1931 1932 /* 1933 * trie() tries to combine nontrivial e and f into a REX_TRIE 1934 * unless 0 is returned, e and f are deleted as far as possible 1935 * also called by regcomb 1936 */ 1937 1938 static Rex_t* 1939 trie(Cenv_t* env, Rex_t* e, Rex_t* f) 1940 { 1941 Rex_t* g; 1942 1943 if (e->next || f->next || !isstring(e) || e->flags != f->flags) 1944 return 0; 1945 if (isstring(f)) 1946 { 1947 if (!(g = node(env, REX_TRIE, 0, 0, (UCHAR_MAX + 1) * sizeof(Trie_node_t*)))) 1948 return 0; 1949 g->re.trie.min = INT_MAX; 1950 if (insert(env, f, g)) 1951 goto nospace; 1952 drop(env->disc, f); 1953 } 1954 else if (f->type != REX_TRIE) 1955 return 0; 1956 else 1957 g = f; 1958 if (insert(env, e, g)) 1959 goto nospace; 1960 drop(env->disc, e); 1961 return g; 1962 nospace: 1963 if (g != f) 1964 drop(env->disc, g); 1965 return 0; 1966 } 1967 1968 static Rex_t* alt(Cenv_t*, int, int); 1969 1970 static int 1971 chr(register Cenv_t* env, int* escaped) 1972 { 1973 unsigned char* p; 1974 int c; 1975 1976 *escaped = 0; 1977 if (!(c = *env->cursor)) 1978 return -1; 1979 env->cursor++; 1980 if (c == '\\') 1981 { 1982 if (env->flags & REG_SHELL_ESCAPED) 1983 return c; 1984 if (!(c = *(env->cursor + 1)) || c == env->terminator) 1985 { 1986 if (env->flags & REG_LENIENT) 1987 return c ? c : '\\'; 1988 env->error = REG_EESCAPE; 1989 return -1; 1990 } 1991 p = env->cursor; 1992 c = chresc((char*)env->cursor - 1, (char**)&env->cursor); 1993 *escaped = env->cursor - p; 1994 } 1995 return c; 1996 } 1997 1998 /* 1999 * open the perly gates 2000 */ 2001 2002 static Rex_t* 2003 grp(Cenv_t* env, int parno) 2004 { 2005 Rex_t* e; 2006 Rex_t* f; 2007 int c; 2008 int i; 2009 int n; 2010 int x; 2011 int esc; 2012 int typ; 2013 int beg; 2014 unsigned char* p; 2015 2016 beg = env->pattern == env->cursor - env->token.len; 2017 if (!(c = env->token.lex) && (c = *env->cursor)) 2018 env->cursor++; 2019 env->token.len = 0; 2020 env->parnest++; 2021 typ = -1; 2022 switch (c) 2023 { 2024 case '-': 2025 case '+': 2026 case 'a': 2027 case 'g': 2028 case 'i': 2029 case 'l': 2030 case 'm': 2031 case 'p': 2032 case 'r': 2033 case 's': 2034 case 'x': 2035 case 'A': 2036 case 'B': 2037 case 'E': 2038 case 'F': 2039 case 'G': 2040 case 'I': 2041 case 'K': 2042 case 'M': 2043 case 'N': 2044 case 'R': 2045 case 'S': 2046 case 'U': /* pcre */ 2047 case 'X': /* pcre */ 2048 x = REX_GROUP; 2049 i = 1; 2050 env->token.push = 1; 2051 for (;;) 2052 { 2053 switch (c) 2054 { 2055 case 0: 2056 case T_CLOSE: 2057 x = 0; 2058 goto done; 2059 case ':': 2060 eat(env); 2061 if (token(env) == T_CLOSE) 2062 x = 0; 2063 goto done; 2064 case '-': 2065 i = 0; 2066 break; 2067 case '+': 2068 i = 1; 2069 break; 2070 case 'a': 2071 if (i) 2072 env->flags |= (REG_LEFT|REG_RIGHT); 2073 else 2074 env->flags &= ~(REG_LEFT|REG_RIGHT); 2075 break; 2076 case 'g': 2077 if (i) 2078 env->flags &= ~REG_MINIMAL; 2079 else 2080 env->flags |= REG_MINIMAL; 2081 break; 2082 case 'i': 2083 if (i) 2084 env->flags |= REG_ICASE; 2085 else 2086 env->flags &= ~REG_ICASE; 2087 break; 2088 case 'l': 2089 if (i) 2090 env->flags |= REG_LEFT; 2091 else 2092 env->flags &= ~REG_LEFT; 2093 break; 2094 case 'm': 2095 if (i) 2096 env->flags |= REG_NEWLINE; 2097 else 2098 env->flags &= ~REG_NEWLINE; 2099 env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; 2100 break; 2101 case 'p': 2102 if (i) 2103 env->flags &= ~REG_LENIENT; 2104 else 2105 env->flags |= REG_LENIENT; 2106 break; 2107 case 'r': 2108 if (i) 2109 env->flags |= REG_RIGHT; 2110 else 2111 env->flags &= ~REG_RIGHT; 2112 break; 2113 case 's': 2114 if (i) 2115 env->flags |= REG_SPAN; 2116 else 2117 env->flags &= ~REG_SPAN; 2118 env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; 2119 break; 2120 case 'x': 2121 if (i) 2122 env->flags |= REG_COMMENT; 2123 else 2124 env->flags &= ~REG_COMMENT; 2125 break; 2126 case 'A': 2127 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 2128 env->flags |= REG_AUGMENTED|REG_EXTENDED; 2129 typ = ARE; 2130 break; 2131 case 'B': 2132 case 'G': 2133 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 2134 typ = BRE; 2135 break; 2136 case 'E': 2137 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 2138 env->flags |= REG_EXTENDED; 2139 typ = ERE; 2140 break; 2141 case 'F': 2142 case 'L': 2143 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 2144 env->flags |= REG_LITERAL; 2145 typ = ERE; 2146 break; 2147 case 'K': 2148 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 2149 env->flags |= REG_AUGMENTED|REG_SHELL|REG_LEFT|REG_RIGHT; 2150 typ = KRE; 2151 break; 2152 case 'M': 2153 /* used by caller to disable glob(3) GLOB_BRACE */ 2154 break; 2155 case 'N': 2156 /* used by caller to disable glob(3) GLOB_NOCHECK */ 2157 break; 2158 case 'R': 2159 /* used by caller to disable glob(3) GLOB_STARSTAR */ 2160 break; 2161 case 'S': 2162 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 2163 env->flags |= REG_SHELL|REG_LEFT|REG_RIGHT; 2164 typ = SRE; 2165 break; 2166 case 'U': /* PCRE_UNGREEDY */ 2167 if (i) 2168 env->flags |= REG_MINIMAL; 2169 else 2170 env->flags &= ~REG_MINIMAL; 2171 break; 2172 case 'X': /* PCRE_EXTRA */ 2173 break; 2174 default: 2175 env->error = REG_BADRPT; 2176 return 0; 2177 } 2178 eat(env); 2179 c = token(env); 2180 } 2181 done: 2182 break; 2183 case ':': 2184 x = REX_GROUP; 2185 break; 2186 case '=': 2187 x = REX_GROUP_AHEAD; 2188 break; 2189 case '!': 2190 x = REX_GROUP_AHEAD_NOT; 2191 break; 2192 case '<': 2193 switch (token(env)) 2194 { 2195 case '=': 2196 x = REX_GROUP_BEHIND; 2197 break; 2198 case '!': 2199 case T_BANG: 2200 x = REX_GROUP_BEHIND_NOT; 2201 break; 2202 default: 2203 env->error = REG_BADRPT; 2204 return 0; 2205 } 2206 eat(env); 2207 break; 2208 case '>': 2209 x = REX_GROUP_CUT; 2210 break; 2211 case '%': 2212 case T_PERCENT: 2213 e = node(env, REX_NEST, 0, 0, (UCHAR_MAX + 1) * sizeof(unsigned short)); 2214 e->re.nest.primary = isalnum(*env->cursor) ? -1 : *env->cursor; 2215 n = 1; 2216 for (;;) 2217 { 2218 switch (i = chr(env, &esc)) 2219 { 2220 case -1: 2221 case 0: 2222 invalid: 2223 env->cursor -= esc + 1; 2224 env->error = REG_EPAREN; 2225 return 0; 2226 case 'D': 2227 x = REX_NEST_delimiter; 2228 /*FALLTHROUGH*/ 2229 delimiter: 2230 if ((i = chr(env, &esc)) < 0) 2231 goto invalid; 2232 if (e->re.nest.type[i] & ~x) 2233 goto invalid; 2234 e->re.nest.type[i] = x; 2235 continue; 2236 case 'E': 2237 x = REX_NEST_escape; 2238 goto delimiter; 2239 case 'L': 2240 x = REX_NEST_literal; 2241 goto quote; 2242 case 'O': 2243 switch (i = chr(env, &esc)) 2244 { 2245 case 'T': 2246 e->re.nest.type[UCHAR_MAX+1] |= REX_NEST_terminator; 2247 break; 2248 default: 2249 goto invalid; 2250 } 2251 continue; 2252 case 'Q': 2253 x = REX_NEST_quote; 2254 /*FALLTHROUGH*/ 2255 quote: 2256 if ((i = chr(env, &esc)) < 0) 2257 goto invalid; 2258 if (e->re.nest.type[i] & ~x) 2259 goto invalid; 2260 e->re.nest.type[i] = x|REX_NEST_open|REX_NEST_close|(i<<REX_NEST_SHIFT); 2261 continue; 2262 case 'S': 2263 x = REX_NEST_separator; 2264 goto delimiter; 2265 case 'T': 2266 x = REX_NEST_terminator; 2267 goto delimiter; 2268 case '|': 2269 case '&': 2270 if (!esc) 2271 goto invalid; 2272 goto nesting; 2273 case '(': 2274 if (!esc) 2275 n++; 2276 goto nesting; 2277 case ')': 2278 if (!esc && !--n) 2279 break; 2280 goto nesting; 2281 default: 2282 nesting: 2283 if (isalnum(i) || (e->re.nest.type[i] & (REX_NEST_close|REX_NEST_escape|REX_NEST_literal|REX_NEST_quote|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) 2284 goto invalid; 2285 e->re.nest.type[i] = REX_NEST_open; 2286 if ((x = chr(env, &esc)) < 0 || (e->re.nest.type[x] & (REX_NEST_close|REX_NEST_escape|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) 2287 goto invalid; 2288 if (!esc) 2289 { 2290 if (x == ')' && !--n) 2291 goto invalid; 2292 else if (x == '(') 2293 n++; 2294 } 2295 e->re.nest.type[x] |= REX_NEST_close; 2296 e->re.nest.type[i] |= x << REX_NEST_SHIFT; 2297 continue; 2298 } 2299 break; 2300 } 2301 env->parnest--; 2302 if (c == T_PERCENT) 2303 for (n = 0; n < 2; n++) 2304 { 2305 parno = ++env->parno; 2306 if (!(f = node(env, REX_GROUP, 0, 0, 0))) 2307 { 2308 drop(env->disc, e); 2309 return 0; 2310 } 2311 if (parno < elementsof(env->paren)) 2312 env->paren[parno] = f; 2313 f->re.group.back = 0; 2314 f->re.group.number = parno; 2315 f->re.group.expr.rex = e; 2316 e = f; 2317 } 2318 return e; 2319 case '(': 2320 c = 0; 2321 if (isdigit(*env->cursor)) 2322 { 2323 f = 0; 2324 do 2325 { 2326 if (c > (INT_MAX / 10)) 2327 { 2328 env->error = REG_BADRPT; 2329 return 0; 2330 } 2331 c = c * 10 + (*env->cursor++ - '0'); 2332 } while (isdigit(*env->cursor)); 2333 if (*env->cursor++ != ')') 2334 { 2335 env->error = REG_BADRPT; 2336 return 0; 2337 } 2338 if (c && env->type >= SRE) 2339 c = c * 2 - 1; 2340 if (!c || c > env->parno || !env->paren[c]) 2341 { 2342 if (!(env->flags & REG_LENIENT)) 2343 { 2344 env->error = REG_ESUBREG; 2345 return 0; 2346 } 2347 if (c) 2348 c = -1; 2349 } 2350 } 2351 else 2352 { 2353 if (env->type < SRE && *env->cursor++ != '?') 2354 { 2355 env->error = REG_BADRPT; 2356 return 0; 2357 } 2358 if (!(f = grp(env, parno + 1)) && env->error) 2359 return 0; 2360 } 2361 if (!(e = node(env, REX_GROUP_COND, 0, 0, 0))) 2362 { 2363 drop(env->disc, f); 2364 return 0; 2365 } 2366 e->re.group.size = c; 2367 e->re.group.expr.binary.left = f; 2368 if (!(e->re.group.expr.binary.right = alt(env, parno, 1))) 2369 { 2370 drop(env->disc, e); 2371 return 0; 2372 } 2373 if (token(env) != T_CLOSE) 2374 { 2375 env->error = REG_EPAREN; 2376 return 0; 2377 } 2378 eat(env); 2379 env->parnest--; 2380 return rep(env, e, parno, parno); 2381 case '{': 2382 p = env->cursor; 2383 n = 1; 2384 while (c = *env->cursor) 2385 { 2386 if (c == '\\' && *(env->cursor + 1)) 2387 env->cursor++; 2388 else if (c == '{') 2389 n++; 2390 else if (c == '}' && !--n) 2391 break; 2392 else if (c == env->delimiter || c == env->terminator) 2393 break; 2394 env->cursor++; 2395 } 2396 if (c != '}') 2397 { 2398 env->error = REG_EBRACE; 2399 return 0; 2400 } 2401 if (*++env->cursor != ')') 2402 { 2403 env->error = REG_EPAREN; 2404 return 0; 2405 } 2406 env->cursor++; 2407 env->parnest--; 2408 if (env->disc->re_version < REG_VERSION_EXEC) 2409 { 2410 env->error = REG_BADRPT; 2411 return 0; 2412 } 2413 if (!env->disc->re_execf) 2414 return 0; 2415 if (!(e = node(env, REX_EXEC, 0, 0, 0))) 2416 return 0; 2417 e->re.exec.text = (const char*)p; 2418 e->re.exec.size = env->cursor - p - 2; 2419 if (!env->disc->re_compf) 2420 e->re.exec.data = 0; 2421 else 2422 e->re.exec.data = (*env->disc->re_compf)(env->regex, e->re.exec.text, e->re.exec.size, env->disc); 2423 return e; 2424 case '0': case '1': case '2': case '3': case '4': 2425 case '5': case '6': case '7': case '8': case '9': 2426 c -= '0'; 2427 while (isdigit(*env->cursor)) 2428 { 2429 if (c > (INT_MAX / 10)) 2430 { 2431 env->error = REG_ESUBREG; 2432 return 0; 2433 } 2434 c = c * 10 + *env->cursor++ - '0'; 2435 } 2436 if (*env->cursor == ')') 2437 { 2438 env->cursor++; 2439 env->parnest--; 2440 env->token.len = 1; 2441 if (c > env->parno || !env->paren[c]) 2442 { 2443 env->error = REG_ESUBREG; 2444 return 0; 2445 } 2446 env->paren[c]->re.group.back = 1; 2447 return rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); 2448 } 2449 /*FALLTHROUGH*/ 2450 default: 2451 env->error = REG_BADRPT; 2452 return 0; 2453 } 2454 if (x && !(e = alt(env, parno, 0))) 2455 return 0; 2456 c = token(env); 2457 env->parnest--; 2458 if (c != T_CLOSE) 2459 { 2460 env->error = REG_EPAREN; 2461 return 0; 2462 } 2463 eat(env); 2464 if (typ >= 0) 2465 { 2466 if (beg) 2467 env->pattern = env->cursor; 2468 env->type = typ; 2469 } 2470 if (!x) 2471 return 0; 2472 if (!(f = node(env, x, 0, 0, 0))) 2473 { 2474 drop(env->disc, e); 2475 return 0; 2476 } 2477 f->re.group.expr.rex = e; 2478 if (x == REX_GROUP_BEHIND || x == REX_GROUP_BEHIND_NOT) 2479 { 2480 if (stats(env, e)) 2481 { 2482 drop(env->disc, f); 2483 if (!env->error) 2484 env->error = REG_ECOUNT; 2485 return 0; 2486 } 2487 f->re.group.size = env->stats.m; 2488 memset(&env->stats, 0, sizeof(env->stats)); 2489 } 2490 switch (x) 2491 { 2492 case REX_GROUP: 2493 case REX_GROUP_CUT: 2494 f = rep(env, f, parno, env->parno); 2495 break; 2496 } 2497 return f; 2498 } 2499 2500 static Rex_t* 2501 seq(Cenv_t* env) 2502 { 2503 Rex_t* e; 2504 Rex_t* f; 2505 Token_t tok; 2506 int c; 2507 int i; 2508 int n; 2509 int x; 2510 int parno; 2511 int type; 2512 regflags_t flags; 2513 unsigned char* s; 2514 unsigned char* p; 2515 unsigned char* t; 2516 unsigned char* u; 2517 unsigned char buf[256]; 2518 2519 for (;;) 2520 { 2521 s = buf; 2522 while ((c = token(env)) < T_META && s < &buf[sizeof(buf) - env->token.len]) 2523 { 2524 x = c; 2525 p = env->cursor; 2526 if (c >= 0) 2527 { 2528 n = 1; 2529 *s++ = (env->flags & REG_ICASE) ? toupper(c) : c; 2530 } 2531 else if (c == C_ESC) 2532 { 2533 if ((n = wctomb(NiL, env->token.lex)) < 0) 2534 continue; 2535 if (!n) 2536 { 2537 n = 1; 2538 *s++ = 0; 2539 } 2540 else if (s < &buf[sizeof(buf) - n]) 2541 { 2542 wctomb((char*)s, c); 2543 s += n; 2544 } 2545 else 2546 break; 2547 } 2548 else 2549 { 2550 n = env->token.len - env->token.esc; 2551 for (t = p, u = s + n; s < u; *s++ = *t++); 2552 } 2553 eat(env); 2554 } 2555 if (c == T_BAD) 2556 return 0; 2557 if (s > buf) 2558 switch (c) 2559 { 2560 case T_STAR: 2561 case T_PLUS: 2562 case T_LEFT: 2563 case T_QUES: 2564 case T_BANG: 2565 if ((s -= n) == buf) 2566 e = 0; 2567 else 2568 { 2569 i = s - buf; 2570 if (!(e = node(env, REX_STRING, 0, 0, i))) 2571 return 0; 2572 memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, i); 2573 e->re.string.size = i; 2574 } 2575 if (x >= 0) 2576 { 2577 if (!(f = node(env, REX_ONECHAR, 1, 1, 0))) 2578 { 2579 drop(env->disc, e); 2580 return 0; 2581 } 2582 f->re.onechar = (env->flags & REG_ICASE) ? toupper(x) : x; 2583 } 2584 else 2585 { 2586 if (!(e = node(env, REX_STRING, 0, 0, n))) 2587 return 0; 2588 memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)p, n); 2589 e->re.string.size = n; 2590 } 2591 if (!(f = rep(env, f, 0, 0)) || !(f = cat(env, f, seq(env)))) 2592 { 2593 drop(env->disc, e); 2594 return 0; 2595 } 2596 if (e) 2597 f = cat(env, e, f); 2598 return f; 2599 default: 2600 c = s - buf; 2601 if (!(e = node(env, REX_STRING, 0, 0, c))) 2602 return 0; 2603 memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, c); 2604 e->re.string.size = c; 2605 return cat(env, e, seq(env)); 2606 } 2607 else if (c > T_BACK) 2608 { 2609 eat(env); 2610 c -= T_BACK; 2611 if (c > env->parno || !env->paren[c]) 2612 { 2613 env->error = REG_ESUBREG; 2614 return 0; 2615 } 2616 env->paren[c]->re.group.back = 1; 2617 e = rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); 2618 } 2619 else 2620 switch (c) 2621 { 2622 case T_AND: 2623 case T_CLOSE: 2624 case T_BAR: 2625 case T_END: 2626 return node(env, REX_NULL, 0, 0, 0); 2627 case T_DOLL: 2628 eat(env); 2629 e = rep(env, node(env, REX_END, 0, 0, 0), 0, 0); 2630 break; 2631 case T_CFLX: 2632 eat(env); 2633 if ((e = node(env, REX_BEG, 0, 0, 0)) && (env->flags & REG_EXTENDED)) 2634 e = rep(env, e, 0, 0); 2635 break; 2636 case T_OPEN: 2637 tok = env->token; 2638 eat(env); 2639 flags = env->flags; 2640 type = env->type; 2641 if (env->token.att) 2642 env->flags |= REG_MINIMAL; 2643 env->parnest++; 2644 if (env->type == KRE) 2645 ++env->parno; 2646 parno = ++env->parno; 2647 if (!(e = alt(env, parno + 1, 0))) 2648 break; 2649 if (e->type == REX_NULL && env->type == ERE && !(env->flags & REG_NULL)) 2650 { 2651 drop(env->disc, e); 2652 env->error = (*env->cursor == 0 || *env->cursor == env->delimiter || *env->cursor == env->terminator) ? REG_EPAREN : REG_ENULL; 2653 return 0; 2654 } 2655 if (token(env) != T_CLOSE) 2656 { 2657 drop(env->disc, e); 2658 env->error = REG_EPAREN; 2659 return 0; 2660 } 2661 env->parnest--; 2662 eat(env); 2663 if (!(f = node(env, REX_GROUP, 0, 0, 0))) 2664 { 2665 drop(env->disc, e); 2666 return 0; 2667 } 2668 if (parno < elementsof(env->paren)) 2669 env->paren[parno] = f; 2670 f->re.group.back = 0; 2671 f->re.group.number = parno; 2672 f->re.group.expr.rex = e; 2673 if (tok.lex) 2674 { 2675 tok.push = 1; 2676 env->token = tok; 2677 } 2678 if (!(e = rep(env, f, parno, env->parno))) 2679 return 0; 2680 if (env->type == KRE) 2681 { 2682 if (!(f = node(env, REX_GROUP, 0, 0, 0))) 2683 { 2684 drop(env->disc, e); 2685 return 0; 2686 } 2687 if (--parno < elementsof(env->paren)) 2688 env->paren[parno] = f; 2689 f->re.group.back = 0; 2690 f->re.group.number = parno; 2691 f->re.group.expr.rex = e; 2692 e = f; 2693 } 2694 env->flags = flags; 2695 env->type = type; 2696 break; 2697 case T_GROUP: 2698 p = env->cursor; 2699 eat(env); 2700 flags = env->flags; 2701 type = env->type; 2702 if (!(e = grp(env, env->parno + 1))) 2703 { 2704 if (env->error) 2705 return 0; 2706 if (env->literal == env->pattern && env->literal == p) 2707 env->literal = env->cursor; 2708 continue; 2709 } 2710 env->flags = flags; 2711 env->type = type; 2712 break; 2713 case T_BRA: 2714 eat(env); 2715 if (e = bra(env)) 2716 e = rep(env, e, 0, 0); 2717 break; 2718 case T_ALNUM: 2719 case T_ALNUM_NOT: 2720 case T_DIGIT: 2721 case T_DIGIT_NOT: 2722 case T_SPACE: 2723 case T_SPACE_NOT: 2724 eat(env); 2725 if (e = ccl(env, c)) 2726 e = rep(env, e, 0, 0); 2727 break; 2728 case T_LT: 2729 eat(env); 2730 e = rep(env, node(env, REX_WBEG, 0, 0, 0), 0, 0); 2731 break; 2732 case T_GT: 2733 eat(env); 2734 e = rep(env, node(env, REX_WEND, 0, 0, 0), 0, 0); 2735 break; 2736 case T_DOT: 2737 eat(env); 2738 e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); 2739 break; 2740 case T_DOTSTAR: 2741 eat(env); 2742 env->token.lex = T_STAR; 2743 env->token.push = 1; 2744 e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); 2745 break; 2746 case T_SLASHPLUS: 2747 eat(env); 2748 env->token.lex = T_PLUS; 2749 env->token.push = 1; 2750 if (e = node(env, REX_ONECHAR, 1, 1, 0)) 2751 { 2752 e->re.onechar = '/'; 2753 e = rep(env, e, 0, 0); 2754 } 2755 break; 2756 case T_WORD: 2757 eat(env); 2758 e = rep(env, node(env, REX_WORD, 0, 0, 0), 0, 0); 2759 break; 2760 case T_WORD_NOT: 2761 eat(env); 2762 e = rep(env, node(env, REX_WORD_NOT, 0, 0, 0), 0, 0); 2763 break; 2764 case T_BEG_STR: 2765 eat(env); 2766 e = rep(env, node(env, REX_BEG_STR, 0, 0, 0), 0, 0); 2767 break; 2768 case T_END_STR: 2769 eat(env); 2770 e = rep(env, node(env, REX_END_STR, 0, 0, 0), 0, 0); 2771 break; 2772 case T_FIN_STR: 2773 eat(env); 2774 e = rep(env, node(env, REX_FIN_STR, 0, 0, 0), 0, 0); 2775 break; 2776 default: 2777 env->error = REG_BADRPT; 2778 return 0; 2779 } 2780 if (e && *env->cursor != 0 && *env->cursor != env->delimiter && *env->cursor != env->terminator) 2781 e = cat(env, e, seq(env)); 2782 return e; 2783 } 2784 } 2785 2786 static Rex_t* 2787 con(Cenv_t* env) 2788 { 2789 Rex_t* e; 2790 Rex_t* f; 2791 Rex_t* g; 2792 2793 if (!(e = seq(env)) || !(env->flags & REG_AUGMENTED) || token(env) != T_AND) 2794 return e; 2795 eat(env); 2796 if (!(f = con(env))) 2797 { 2798 drop(env->disc, e); 2799 return 0; 2800 } 2801 if (!(g = node(env, REX_CONJ, 0, 0, 0))) 2802 { 2803 drop(env->disc, e); 2804 drop(env->disc, f); 2805 return 0; 2806 } 2807 g->re.group.expr.binary.left = e; 2808 g->re.group.expr.binary.right = f; 2809 return g; 2810 } 2811 2812 static Rex_t* 2813 alt(Cenv_t* env, int number, int cond) 2814 { 2815 Rex_t* e; 2816 Rex_t* f; 2817 Rex_t* g; 2818 2819 if (!(e = con(env))) 2820 return 0; 2821 else if (token(env) != T_BAR) 2822 { 2823 if (!cond) 2824 return e; 2825 f = 0; 2826 if (e->type == REX_NULL) 2827 goto bad; 2828 } 2829 else 2830 { 2831 eat(env); 2832 if (!(f = alt(env, number, 0))) 2833 { 2834 drop(env->disc, e); 2835 return 0; 2836 } 2837 if ((e->type == REX_NULL || f->type == REX_NULL) && !(env->flags & REG_NULL)) 2838 goto bad; 2839 if (!cond && (g = trie(env, e, f))) 2840 return g; 2841 } 2842 if (!(g = node(env, REX_ALT, 0, 0, 0))) 2843 { 2844 env->error = REG_ESPACE; 2845 goto bad; 2846 } 2847 g->re.group.number = number; 2848 g->re.group.last = env->parno; 2849 g->re.group.expr.binary.left = e; 2850 g->re.group.expr.binary.right = f; 2851 return g; 2852 bad: 2853 drop(env->disc, e); 2854 drop(env->disc, f); 2855 if (!env->error) 2856 env->error = REG_ENULL; 2857 return 0; 2858 } 2859 2860 /* 2861 * add v to REX_BM tables 2862 */ 2863 2864 static void 2865 bmstr(Cenv_t* env, register Rex_t* a, unsigned char* v, int n, Bm_mask_t b) 2866 { 2867 int c; 2868 int m; 2869 size_t z; 2870 2871 for (m = 0; m < n; m++) 2872 { 2873 if (!(z = n - m - 1)) 2874 z = HIT; 2875 c = v[m]; 2876 a->re.bm.mask[m][c] |= b; 2877 if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) 2878 a->re.bm.skip[c] = z; 2879 if (a->flags & REG_ICASE) 2880 { 2881 if (isupper(c)) 2882 c = tolower(c); 2883 else if (islower(c)) 2884 c = toupper(c); 2885 else 2886 continue; 2887 a->re.bm.mask[m][c] |= b; 2888 if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) 2889 a->re.bm.skip[c] = z; 2890 } 2891 } 2892 } 2893 2894 /* 2895 * set up BM table from trie 2896 */ 2897 2898 static int 2899 bmtrie(Cenv_t* env, Rex_t* a, unsigned char* v, Trie_node_t* x, int n, int m, Bm_mask_t b) 2900 { 2901 do 2902 { 2903 v[m] = x->c; 2904 if (m >= (n - 1)) 2905 { 2906 bmstr(env, a, v, n, b); 2907 if (!(b <<= 1)) 2908 { 2909 b = 1; 2910 a->re.bm.complete = 0; 2911 } 2912 else if (x->son) 2913 a->re.bm.complete = 0; 2914 } 2915 else if (x->son) 2916 b = bmtrie(env, a, v, x->son, n, m + 1, b); 2917 } while (x = x->sib); 2918 return b; 2919 } 2920 2921 /* 2922 * rewrite the expression tree for some special cases 2923 * 1. it is a null expression - illegal 2924 * 2. max length fixed string found -- use BM algorithm 2925 * 3. it begins with an unanchored string - use KMP algorithm 2926 * 0 returned on success 2927 */ 2928 2929 static int 2930 special(Cenv_t* env, regex_t* p) 2931 { 2932 Rex_t* a; 2933 Rex_t* e; 2934 Rex_t* t; 2935 Rex_t* x; 2936 Rex_t* y; 2937 unsigned char* s; 2938 int* f; 2939 int n; 2940 int m; 2941 int k; 2942 2943 DEBUG_INIT(); 2944 if (e = p->env->rex) 2945 { 2946 if ((x = env->stats.x) && x->re.string.size < 3) 2947 x = 0; 2948 if ((t = env->stats.y) && t->re.trie.min < 3) 2949 t = 0; 2950 if (x && t) 2951 { 2952 if (x->re.string.size >= t->re.trie.min) 2953 t = 0; 2954 else 2955 x = 0; 2956 } 2957 if ((x || t) && !env->map) /* HERE: figure out map */ 2958 { 2959 Bm_mask_t** mask; 2960 Bm_mask_t* h; 2961 unsigned char* v; 2962 size_t* q; 2963 unsigned long l; 2964 int i; 2965 int j; 2966 2967 if (x) 2968 { 2969 y = x; 2970 n = m = x->re.string.size; 2971 l = env->stats.l; 2972 } 2973 else 2974 { 2975 y = t; 2976 n = t->re.trie.min; 2977 m = t->re.trie.max; 2978 l = env->stats.k; 2979 } 2980 if (!(q = (size_t*)alloc(env->disc, 0, (n + 1) * sizeof(size_t)))) 2981 return 1; 2982 if (!(a = node(env, REX_BM, 0, 0, n * (sizeof(Bm_mask_t*) + (UCHAR_MAX + 1) * sizeof(Bm_mask_t)) + (UCHAR_MAX + n + 2) * sizeof(size_t)))) 2983 { 2984 alloc(env->disc, q, 0); 2985 return 1; 2986 } 2987 a->flags = y->flags; 2988 a->map = y->map; 2989 a->re.bm.size = n; 2990 a->re.bm.back = (y == e || y == e->re.group.expr.rex) ? (m - n) : -1; 2991 a->re.bm.left = l - 1; 2992 a->re.bm.right = env->stats.m - l - n; 2993 a->re.bm.complete = (y != e && (e->type != REX_GROUP || y != e->re.group.expr.rex) || e->next || ((a->re.bm.left + a->re.bm.right) >= 0)) ? 0 : n; 2994 h = (Bm_mask_t*)&a->re.bm.mask[n]; 2995 a->re.bm.skip = (size_t*)(h + n * (UCHAR_MAX + 1)); 2996 a->re.bm.fail = &a->re.bm.skip[UCHAR_MAX + 1]; 2997 for (m = 0; m <= UCHAR_MAX; m++) 2998 a->re.bm.skip[m] = n; 2999 a->re.bm.skip[0] = a->re.bm.skip[env->mappednewline] = (y->next && y->next->type == REX_END) ? HIT : (n + a->re.bm.left); 3000 for (i = 1; i <= n; i++) 3001 a->re.bm.fail[i] = 2 * n - i; 3002 mask = a->re.bm.mask; 3003 for (m = 0; m < n; m++) 3004 { 3005 mask[m] = h; 3006 h += UCHAR_MAX + 1; 3007 } 3008 if (x) 3009 bmstr(env, a, x->re.string.base, n, 1); 3010 else 3011 { 3012 v = (unsigned char*)q; 3013 memset(v, 0, n); 3014 m = 1; 3015 for (i = 0; i <= UCHAR_MAX; i++) 3016 if (t->re.trie.root[i]) 3017 m = bmtrie(env, a, v, t->re.trie.root[i], n, 0, m); 3018 } 3019 mask--; 3020 memset(q, 0, n * sizeof(*q)); 3021 for (k = (j = n) + 1; j > 0; j--, k--) 3022 { 3023 DEBUG_TEST(0x0010,(sfprintf(sfstderr, "BM#0: k=%d j=%d\n", k, j)),(0)); 3024 for (q[j] = k; k <= n; k = q[k]) 3025 { 3026 for (m = 0; m <= UCHAR_MAX; m++) 3027 if (mask[k][m] == mask[j][m]) 3028 { 3029 DEBUG_TEST(0x0010,sfprintf(sfstderr, "CUT1: mask[%d][%c]=mask[%d][%c]\n", k, m, j, m), (0)); 3030 goto cut; 3031 } 3032 DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#2: fail[%d]=%d => %d\n", k, a->re.bm.fail[k], (a->re.bm.fail[k] > n - j) ? (n - j) : a->re.bm.fail[k]), (0)); 3033 if (a->re.bm.fail[k] > n - j) 3034 a->re.bm.fail[k] = n - j; 3035 } 3036 cut: ; 3037 } 3038 for (i = 1; i <= n; i++) 3039 if (a->re.bm.fail[i] > n + k - i) 3040 { 3041 DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#4: fail[%d]=%d => %d\n", i, a->re.bm.fail[i], n + k - i), (0)); 3042 a->re.bm.fail[i] = n + k - i; 3043 } 3044 #if _AST_REGEX_DEBUG 3045 if (DEBUG_TEST(0x0020,(1),(0))) 3046 { 3047 sfprintf(sfstderr, "STAT: complete=%d n=%d k=%d l=%d r=%d y=%d:%d e=%d:%d\n", a->re.bm.complete, n, k, a->re.bm.left, a->re.bm.right, y->type, y->next ? y->next->type : 0, e->type, e->next ? e->next->type : 0); 3048 for (m = 0; m < n; m++) 3049 for (i = 1; i <= UCHAR_MAX; i++) 3050 if (a->re.bm.mask[m][i]) 3051 sfprintf(sfstderr, "MASK: [%d]['%c'] = %032..2u\n", m, i, a->re.bm.mask[m][i]); 3052 for (i = ' '; i <= UCHAR_MAX; i++) 3053 if (a->re.bm.skip[i] >= HIT) 3054 sfprintf(sfstderr, "SKIP: ['%c'] = *\n", i); 3055 else if (a->re.bm.skip[i] > 0 && a->re.bm.skip[i] < n) 3056 sfprintf(sfstderr, "SKIP: ['%c'] = %3d\n", i, a->re.bm.skip[i]); 3057 for (j = 31; j >= 0; j--) 3058 { 3059 sfprintf(sfstderr, " "); 3060 next: 3061 for (m = 0; m < n; m++) 3062 { 3063 for (i = 0040; i < 0177; i++) 3064 if (a->re.bm.mask[m][i] & (1 << j)) 3065 { 3066 sfprintf(sfstderr, " %c", i); 3067 break; 3068 } 3069 if (i >= 0177) 3070 { 3071 if (j > 0) 3072 { 3073 j--; 3074 goto next; 3075 } 3076 sfprintf(sfstderr, " ?"); 3077 } 3078 } 3079 sfprintf(sfstderr, "\n"); 3080 } 3081 sfprintf(sfstderr, "FAIL: "); 3082 for (m = 1; m <= n; m++) 3083 sfprintf(sfstderr, "%3d", a->re.bm.fail[m]); 3084 sfprintf(sfstderr, "\n"); 3085 } 3086 #endif 3087 alloc(env->disc, q, 0); 3088 a->next = e; 3089 p->env->rex = a; 3090 return 0; 3091 } 3092 switch (e->type) 3093 { 3094 case REX_BEG: 3095 if (env->flags & REG_NEWLINE) 3096 return 0; 3097 break; 3098 case REX_GROUP: 3099 if (env->stats.b) 3100 return 0; 3101 e = e->re.group.expr.rex; 3102 if (e->type != REX_DOT) 3103 return 0; 3104 /*FALLTHROUGH*/ 3105 case REX_DOT: 3106 if (e->lo == 0 && e->hi == RE_DUP_INF) 3107 break; 3108 return 0; 3109 case REX_NULL: 3110 if (env->flags & REG_NULL) 3111 break; 3112 env->error = REG_ENULL; 3113 return 1; 3114 case REX_STRING: 3115 if (env->flags & (REG_LEFT|REG_LITERAL|REG_RIGHT)) 3116 return 0; 3117 s = e->re.string.base; 3118 n = e->re.string.size; 3119 if (!(a = node(env, REX_KMP, 0, 0, n * (sizeof(int*) + 1)))) 3120 return 1; 3121 a->flags = e->flags; 3122 a->map = e->map; 3123 f = a->re.string.fail; 3124 memcpy((char*)(a->re.string.base = (unsigned char*)&f[n]), (char*)s, n); 3125 s = a->re.string.base; 3126 a->re.string.size = n; 3127 f[0] = m = -1; 3128 for (k = 1; k < n; k++) 3129 { 3130 while (m >= 0 && s[m+1] != s[k]) 3131 m = f[m]; 3132 if (s[m+1] == s[k]) 3133 m++; 3134 f[k] = m; 3135 } 3136 a->next = e->next; 3137 p->env->rex = a; 3138 e->next = 0; 3139 drop(env->disc, e); 3140 break; 3141 default: 3142 return 0; 3143 } 3144 } 3145 p->env->once = 1; 3146 return 0; 3147 } 3148 3149 int 3150 regcomp(regex_t* p, const char* pattern, regflags_t flags) 3151 { 3152 Rex_t* e; 3153 Rex_t* f; 3154 regdisc_t* disc; 3155 unsigned char* fold; 3156 int i; 3157 Cenv_t env; 3158 3159 if (!p) 3160 return REG_BADPAT; 3161 if (flags & REG_DISCIPLINE) 3162 { 3163 flags &= ~REG_DISCIPLINE; 3164 disc = p->re_disc; 3165 } 3166 else 3167 disc = &state.disc; 3168 if (!disc->re_errorlevel) 3169 disc->re_errorlevel = 2; 3170 p->env = 0; 3171 if (!pattern) 3172 return fatal(disc, REG_BADPAT, pattern); 3173 if (!state.initialized) 3174 { 3175 state.initialized = 1; 3176 for (i = 0; i < elementsof(state.escape); i++) 3177 state.magic[state.escape[i].key] = state.escape[i].val; 3178 } 3179 if (!(fold = (unsigned char*)LCINFO(AST_LC_CTYPE)->data)) 3180 { 3181 if (!(fold = newof(0, unsigned char, UCHAR_MAX, 1))) 3182 return fatal(disc, REG_ESPACE, pattern); 3183 for (i = 0; i <= UCHAR_MAX; i++) 3184 fold[i] = toupper(i); 3185 LCINFO(AST_LC_CTYPE)->data = (void*)fold; 3186 } 3187 again: 3188 if (!(p->env = (Env_t*)alloc(disc, 0, sizeof(Env_t)))) 3189 return fatal(disc, REG_ESPACE, pattern); 3190 memset(p->env, 0, sizeof(*p->env)); 3191 memset(&env, 0, sizeof(env)); 3192 env.regex = p; 3193 env.flags = flags; 3194 env.disc = p->env->disc = disc; 3195 if (env.flags & REG_AUGMENTED) 3196 env.flags |= REG_EXTENDED; 3197 env.mappeddot = '.'; 3198 env.mappednewline = '\n'; 3199 env.mappedslash = '/'; 3200 if (disc->re_version >= REG_VERSION_MAP && disc->re_map) 3201 { 3202 env.map = disc->re_map; 3203 env.MAP = p->env->fold; 3204 for (i = 0; i <= UCHAR_MAX; i++) 3205 { 3206 env.MAP[i] = fold[env.map[i]]; 3207 if (env.map[i] == '.') 3208 env.mappeddot = i; 3209 if (env.map[i] == '\n') 3210 env.mappednewline = i; 3211 if (env.map[i] == '/') 3212 env.mappedslash = i; 3213 } 3214 } 3215 else 3216 env.MAP = fold; 3217 env.type = (env.flags & REG_AUGMENTED) ? ARE : (env.flags & REG_EXTENDED) ? ERE : BRE; 3218 env.explicit = -1; 3219 if (env.flags & REG_SHELL) 3220 { 3221 if (env.flags & REG_SHELL_PATH) 3222 env.explicit = env.mappedslash; 3223 env.flags |= REG_LENIENT|REG_NULL; 3224 env.type = env.type == BRE ? SRE : KRE; 3225 } 3226 if ((env.flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE) 3227 env.explicit = env.mappednewline; 3228 p->env->leading = (env.flags & REG_SHELL_DOT) ? env.mappeddot : -1; 3229 env.posixkludge = !(env.flags & (REG_EXTENDED|REG_SHELL)); 3230 env.token.lex = 0; 3231 env.token.push = 0; 3232 if (env.flags & REG_DELIMITED) 3233 { 3234 switch (env.delimiter = *pattern++) 3235 { 3236 case 0: 3237 case '\\': 3238 case '\n': 3239 case '\r': 3240 env.error = REG_EDELIM; 3241 goto bad; 3242 } 3243 env.terminator = '\n'; 3244 } 3245 env.literal = env.pattern = env.cursor = (unsigned char*)pattern; 3246 if (!(p->env->rex = alt(&env, 1, 0))) 3247 goto bad; 3248 if (env.parnest) 3249 { 3250 env.error = REG_EPAREN; 3251 goto bad; 3252 } 3253 p->env->stats.re_flags = env.flags & (REG_EXTENDED|REG_AUGMENTED|REG_SHELL); 3254 if (env.flags & REG_LEFT) 3255 { 3256 if (p->env->rex->type != REX_BEG) 3257 { 3258 if (p->env->rex->type == REX_ALT) 3259 env.flags &= ~REG_FIRST; 3260 if (!(e = node(&env, REX_BEG, 0, 0, 0))) 3261 { 3262 regfree(p); 3263 return fatal(disc, REG_ESPACE, pattern); 3264 } 3265 e->next = p->env->rex; 3266 p->env->rex = e; 3267 p->env->once = 1; 3268 } 3269 p->env->stats.re_flags |= REG_LEFT; 3270 } 3271 for (e = p->env->rex; e->next; e = e->next); 3272 p->env->done.type = REX_DONE; 3273 p->env->done.flags = e->flags; 3274 if (env.flags & REG_RIGHT) 3275 { 3276 if (e->type != REX_END) 3277 { 3278 if (p->env->rex->type == REX_ALT) 3279 env.flags &= ~REG_FIRST; 3280 if (!(f = node(&env, REX_END, 0, 0, 0))) 3281 { 3282 regfree(p); 3283 return fatal(disc, REG_ESPACE, pattern); 3284 } 3285 f->flags = e->flags; 3286 f->map = e->map; 3287 e->next = f; 3288 } 3289 p->env->stats.re_flags |= REG_RIGHT; 3290 } 3291 if (stats(&env, p->env->rex)) 3292 { 3293 if (!env.error) 3294 env.error = REG_ECOUNT; 3295 goto bad; 3296 } 3297 if (env.stats.b) 3298 p->env->hard = p->env->separate = 1; 3299 else if (!(env.flags & REG_FIRST) && (env.stats.a || env.stats.c > 1 && env.stats.c != env.stats.s || env.stats.t && (env.stats.t > 1 || env.stats.a || env.stats.c))) 3300 p->env->hard = 1; 3301 if (p->env->hard || env.stats.c || env.stats.i) 3302 p->env->stats.re_min = p->env->stats.re_max = -1; 3303 else 3304 { 3305 if (!(p->env->stats.re_min = env.stats.m)) 3306 p->env->stats.re_min = -1; 3307 if (!(p->env->stats.re_max = env.stats.n)) 3308 p->env->stats.re_max = -1; 3309 } 3310 if (special(&env, p)) 3311 goto bad; 3312 serialize(&env, p->env->rex, 1); 3313 p->re_nsub = env.stats.p; 3314 if (env.type == KRE) 3315 p->re_nsub /= 2; 3316 if (env.flags & REG_DELIMITED) 3317 { 3318 p->re_npat = env.cursor - env.pattern + 1; 3319 if (*env.cursor == env.delimiter) 3320 p->re_npat++; 3321 else if (env.flags & REG_MUSTDELIM) 3322 { 3323 env.error = REG_EDELIM; 3324 goto bad; 3325 } 3326 else 3327 env.flags &= ~REG_DELIMITED; 3328 } 3329 p->env->explicit = env.explicit; 3330 p->env->flags = env.flags & REG_COMP; 3331 p->env->min = env.stats.m; 3332 p->env->nsub = env.stats.p + env.stats.u; 3333 p->env->refs = 1; 3334 return 0; 3335 bad: 3336 regfree(p); 3337 if (!env.error) 3338 env.error = REG_ESPACE; 3339 if (env.type >= SRE && env.error != REG_ESPACE && !(flags & REG_LITERAL)) 3340 { 3341 flags |= REG_LITERAL; 3342 pattern = (const char*)env.literal; 3343 goto again; 3344 } 3345 return fatal(disc, env.error, pattern); 3346 } 3347 3348 /* 3349 * regcomp() on sized pattern 3350 * the lazy way around adding and checking env.end 3351 */ 3352 3353 int 3354 regncomp(regex_t* p, const char* pattern, size_t size, regflags_t flags) 3355 { 3356 char* s; 3357 int r; 3358 3359 if (!(s = malloc(size + 1))) 3360 return fatal((flags & REG_DISCIPLINE) ? p->re_disc : &state.disc, REG_ESPACE, pattern); 3361 memcpy(s, pattern, size); 3362 s[size] = 0; 3363 r = regcomp(p, s, flags); 3364 free(s); 3365 return r; 3366 } 3367 3368 /* 3369 * combine two compiled regular expressions if possible, 3370 * replacing first with the combination and freeing second. 3371 * return 0 on success. 3372 * the only combinations handled are building a Trie 3373 * from String|Kmp|Trie and String|Kmp 3374 */ 3375 3376 int 3377 regcomb(regex_t* p, regex_t* q) 3378 { 3379 Rex_t* e = p->env->rex; 3380 Rex_t* f = q->env->rex; 3381 Rex_t* g; 3382 Cenv_t env; 3383 3384 if (!e || !f) 3385 return fatal(p->env->disc, REG_BADPAT, NiL); 3386 if (p->env->separate || q->env->separate) 3387 return REG_ESUBREG; 3388 memset(&env, 0, sizeof(env)); 3389 env.disc = p->env->disc; 3390 if (e->type == REX_BM) 3391 { 3392 p->env->rex = e->next; 3393 e->next = 0; 3394 drop(env.disc, e); 3395 e = p->env->rex; 3396 } 3397 if (f->type == REX_BM) 3398 { 3399 q->env->rex = f->next; 3400 f->next = 0; 3401 drop(env.disc, f); 3402 f = q->env->rex; 3403 } 3404 if (e->type == REX_BEG && f->type == REX_BEG) 3405 { 3406 p->env->flags |= REG_LEFT; 3407 p->env->rex = e->next; 3408 e->next = 0; 3409 drop(env.disc, e); 3410 e = p->env->rex; 3411 q->env->rex = f->next; 3412 f->next = 0; 3413 drop(env.disc, f); 3414 f = q->env->rex; 3415 } 3416 if (e->next && e->next->type == REX_END && f->next && f->next->type == REX_END) 3417 { 3418 p->env->flags |= REG_RIGHT; 3419 drop(env.disc, e->next); 3420 e->next = 0; 3421 drop(env.disc, f->next); 3422 f->next = 0; 3423 } 3424 if (!(g = trie(&env, f, e))) 3425 return fatal(p->env->disc, REG_BADPAT, NiL); 3426 p->env->rex = g; 3427 if (!q->env->once) 3428 p->env->once = 0; 3429 q->env->rex = 0; 3430 if (p->env->flags & REG_LEFT) 3431 { 3432 if (!(e = node(&env, REX_BEG, 0, 0, 0))) 3433 { 3434 regfree(p); 3435 return fatal(p->env->disc, REG_ESPACE, NiL); 3436 } 3437 e->next = p->env->rex; 3438 p->env->rex = e; 3439 p->env->once = 1; 3440 } 3441 if (p->env->flags & REG_RIGHT) 3442 { 3443 for (f = p->env->rex; f->next; f = f->next); 3444 if (f->type != REX_END) 3445 { 3446 if (!(e = node(&env, REX_END, 0, 0, 0))) 3447 { 3448 regfree(p); 3449 return fatal(p->env->disc, REG_ESPACE, NiL); 3450 } 3451 f->next = e; 3452 } 3453 } 3454 env.explicit = p->env->explicit; 3455 env.flags = p->env->flags; 3456 env.disc = p->env->disc; 3457 if (stats(&env, p->env->rex)) 3458 { 3459 regfree(p); 3460 return fatal(p->env->disc, env.error ? env.error : REG_ECOUNT, NiL); 3461 } 3462 if (special(&env, p)) 3463 { 3464 regfree(p); 3465 return fatal(p->env->disc, env.error ? env.error : REG_ESPACE, NiL); 3466 } 3467 p->env->min = g->re.trie.min; 3468 return 0; 3469 } 3470 3471 /* 3472 * copy a reference of p into q 3473 * p and q may then have separate regsubcomp() instantiations 3474 */ 3475 3476 int 3477 regdup(regex_t* p, regex_t* q) 3478 { 3479 if (!p || !q) 3480 return REG_BADPAT; 3481 *q = *p; 3482 p->env->refs++; 3483 q->re_sub = 0; 3484 return 0; 3485 } 3486