1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1985-2008 AT&T Intellectual Property * 5 * and is licensed under the * 6 * Common Public License, Version 1.0 * 7 * by AT&T Intellectual Property * 8 * * 9 * A copy of the License is available at * 10 * http://www.opensource.org/licenses/cpl1.0.txt * 11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * Phong Vo <kpv@research.att.com> * 20 * * 21 ***********************************************************************/ 22 #pragma prototyped 23 24 /* 25 * posix regex compiler 26 */ 27 28 #include "reglib.h" 29 30 #if _PACKAGE_ast 31 #include "lclib.h" 32 #endif 33 34 #define serialize re_serialize /* hp.ia64 <unistd.h>! */ 35 36 #define C_ESC (-1) 37 #define C_MB (-2) 38 39 #if _AST_REGEX_DEBUG 40 41 #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n)) 42 #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0) 43 #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_comp_debug")) debug |= strtoul(t, NiL, 0); } } while (0) 44 45 static unsigned long debug; 46 static unsigned long debug_flag; 47 48 #else 49 50 #define DEBUG_INIT() 51 #define DEBUG_TEST(f,y,n) (n) 52 #define DEBUG_CODE(f,y,n) do {n} while(0) 53 54 #endif 55 56 #if _PACKAGE_ast 57 58 typedef struct Cchr_s 59 { 60 Dtlink_t lnk; 61 unsigned char nam[2]; 62 Ckey_t key; 63 } Cchr_t; 64 65 #endif 66 67 #define eat(p) do{if ((p)->token.push)(p)->token.push=0;else (p)->cursor+=(p)->token.len;}while (0) 68 69 /* 70 * determine whether greedy matching will work, i.e. produce 71 * the best match first. such expressions are "easy", and 72 * need no backtracking once a complete match is found. 73 * if an expression has backreferences or alts it's hard 74 * else if it has only one closure it's easy 75 * else if all closures are simple (i.e. one-character) it's easy 76 * else it's hard. 77 */ 78 79 typedef struct Stats_s 80 { 81 unsigned long l; /* min length to left of x */ 82 unsigned long k; /* min length to left of y */ 83 unsigned long m; /* min length */ 84 unsigned long n; /* max length */ 85 unsigned short a; /* number of alternations */ 86 unsigned short b; /* number of backrefs */ 87 unsigned short c; /* number of closures */ 88 unsigned short i; /* number of negations */ 89 unsigned short p; /* number of named subexpressions */ 90 unsigned short s; /* number of simple closures */ 91 unsigned short t; /* number of tries */ 92 unsigned short u; /* number of unnamed subexpressions */ 93 Rex_t* x; /* max length REX_STRING */ 94 Rex_t* y; /* max length REX_TRIE */ 95 } Stats_t; 96 97 typedef struct Token_s 98 { 99 unsigned long min; 100 unsigned long max; 101 short lex; 102 short len; 103 short esc; 104 short att; 105 short push; 106 } Token_t; 107 108 typedef struct Cenv_s 109 { 110 int delimiter; /* pattern delimiter */ 111 int error; /* last error */ 112 int explicit; /* explicit match on this char */ 113 int mappeddot; /* inverse mapped '.' */ 114 int mappednewline; /* inverse mapped '\n' */ 115 int mappedslash; /* inverse mapped '/' */ 116 regflags_t flags; /* flags arg to regcomp */ 117 int type; /* BRE,ERE,ARE,SRE,KRE */ 118 unsigned char* cursor; /* curent point in re */ 119 unsigned char* pattern; /* the original pattern */ 120 unsigned char* literal; /* literal restart pattern */ 121 int parno; /* number of last open paren */ 122 int parnest; /* paren nest count */ 123 int posixkludge; /* to make * nonspecial */ 124 Token_t token; /* token lookahead */ 125 Stats_t stats; /* RE statistics */ 126 int terminator; /* pattern terminator */ 127 Rex_t* paren[2*(BACK_REF_MAX+2)]; 128 /* paren[i]!=0 if \i defined */ 129 regex_t* regex; /* user handle */ 130 regdisc_t* disc; /* user discipline */ 131 unsigned char* map; /* external to native ccode map */ 132 unsigned char* MAP; /* fold and/or map */ 133 } Cenv_t; 134 135 /* 136 * allocate a new Rex_t node 137 */ 138 139 static Rex_t* 140 node(Cenv_t* env, int type, int lo, int hi, size_t extra) 141 { 142 register Rex_t* e; 143 144 if (e = (Rex_t*)alloc(env->disc, 0, sizeof(Rex_t) + extra)) 145 { 146 memset(e, 0, sizeof(Rex_t) + extra); 147 e->type = type; 148 e->marked = 0; 149 e->lo = lo; 150 e->hi = hi; 151 e->flags = env->flags; 152 e->map = (e->flags & REG_ICASE) ? env->MAP : env->map; 153 e->explicit = env->explicit; 154 if (extra) 155 e->re.data = (char*)e + sizeof(Rex_t); 156 } 157 return e; 158 } 159 160 /* 161 * free a Trie_node_t node 162 */ 163 164 static void 165 triedrop(regdisc_t* disc, Trie_node_t* e) 166 { 167 if (e) 168 { 169 triedrop(disc, e->son); 170 triedrop(disc, e->sib); 171 alloc(disc, e, 0); 172 } 173 } 174 175 /* 176 * free a Rex_t node 177 */ 178 179 void 180 drop(regdisc_t* disc, Rex_t* e) 181 { 182 int i; 183 Rex_t* x; 184 185 if (e && !(disc->re_flags & REG_NOFREE)) 186 do 187 { 188 switch (e->type) 189 { 190 case REX_ALT: 191 case REX_CONJ: 192 drop(disc, e->re.group.expr.binary.left); 193 drop(disc, e->re.group.expr.binary.right); 194 break; 195 case REX_GROUP: 196 case REX_GROUP_AHEAD: 197 case REX_GROUP_AHEAD_NOT: 198 case REX_GROUP_BEHIND: 199 case REX_GROUP_BEHIND_NOT: 200 case REX_GROUP_CUT: 201 case REX_NEG: 202 case REX_REP: 203 drop(disc, e->re.group.expr.rex); 204 break; 205 case REX_TRIE: 206 for (i = 0; i <= UCHAR_MAX; i++) 207 triedrop(disc, e->re.trie.root[i]); 208 break; 209 } 210 x = e->next; 211 alloc(disc, e, 0); 212 } while (e = x); 213 } 214 215 /* 216 * mark e and descendants minimal 217 */ 218 219 static void 220 mark(register Rex_t* e, int set) 221 { 222 if (e && !e->marked) 223 do 224 { 225 e->marked = 1; 226 if (set) 227 e->flags |= REG_MINIMAL; 228 else 229 e->flags &= ~REG_MINIMAL; 230 switch (e->type) 231 { 232 case REX_ALT: 233 case REX_CONJ: 234 case REX_GROUP_COND: 235 if (e->re.group.expr.binary.left) 236 mark(e->re.group.expr.binary.left, set); 237 if (e->re.group.expr.binary.right) 238 mark(e->re.group.expr.binary.right, set); 239 break; 240 case REX_GROUP: 241 case REX_GROUP_AHEAD: 242 case REX_GROUP_AHEAD_NOT: 243 case REX_GROUP_BEHIND: 244 case REX_GROUP_BEHIND_NOT: 245 case REX_GROUP_CUT: 246 case REX_NEG: 247 case REX_REP: 248 case REX_TRIE: 249 mark(e->re.group.expr.rex, set); 250 break; 251 } 252 } while (e = e->next); 253 } 254 255 /* 256 * assign subexpression numbers by a preorder tree walk 257 */ 258 259 static int 260 serialize(Cenv_t* env, Rex_t* e, int n) 261 { 262 do 263 { 264 e->serial = n++; 265 switch (e->type) 266 { 267 case REX_ALT: 268 case REX_GROUP_COND: 269 if (e->re.group.expr.binary.left) 270 n = serialize(env, e->re.group.expr.binary.left, n); 271 e->re.group.expr.binary.serial = n++; 272 if (e->re.group.expr.binary.right) 273 n = serialize(env, e->re.group.expr.binary.right, n); 274 break; 275 case REX_CONJ: 276 n = serialize(env, e->re.group.expr.binary.left, n); 277 n = serialize(env, e->re.group.expr.binary.right, n); 278 break; 279 case REX_GROUP: 280 case REX_GROUP_AHEAD: 281 case REX_GROUP_AHEAD_NOT: 282 case REX_GROUP_BEHIND: 283 case REX_GROUP_BEHIND_NOT: 284 case REX_GROUP_CUT: 285 case REX_NEG: 286 case REX_REP: 287 n = serialize(env, e->re.group.expr.rex, n); 288 break; 289 } 290 } while (e = e->next); 291 return n; 292 } 293 294 /* 295 * catenate e and f into a sequence, collapsing them if possible 296 */ 297 298 static Rex_t* 299 cat(Cenv_t* env, Rex_t* e, Rex_t* f) 300 { 301 Rex_t* g; 302 303 if (!f) 304 { 305 drop(env->disc, e); 306 return 0; 307 } 308 if (e->type == REX_NULL) 309 { 310 drop(env->disc, e); 311 return f; 312 } 313 if (f->type == REX_NULL) 314 { 315 g = f->next; 316 f->next = 0; 317 drop(env->disc, f); 318 f = g; 319 } 320 else if (e->type == REX_DOT && f->type == REX_DOT) 321 { 322 unsigned int m = e->lo + f->lo; 323 unsigned int n = e->hi + f->hi; 324 325 if (m <= RE_DUP_MAX) 326 { 327 if (e->hi > RE_DUP_MAX || f->hi > RE_DUP_MAX) 328 { 329 n = RE_DUP_INF; 330 goto combine; 331 } 332 else if (n <= RE_DUP_MAX) 333 { 334 combine: 335 e->lo = m; 336 e->hi = n; 337 g = f->next; 338 f->next = 0; 339 drop(env->disc, f); 340 f = g; 341 } 342 } 343 } 344 e->next = f; 345 return e; 346 } 347 348 /* 349 * collect re statistics 350 */ 351 352 static int 353 stats(register Cenv_t* env, register Rex_t* e) 354 { 355 register unsigned long n; 356 register unsigned long m; 357 unsigned long cm; 358 unsigned long nm; 359 unsigned long cn; 360 unsigned long nn; 361 unsigned long l; 362 unsigned long k; 363 unsigned long t; 364 Rex_t* q; 365 Rex_t* x; 366 Rex_t* y; 367 unsigned char c; 368 unsigned char b; 369 370 do 371 { 372 switch (e->type) 373 { 374 case REX_ALT: 375 x = env->stats.x; 376 l = env->stats.l; 377 y = env->stats.y; 378 k = env->stats.k; 379 t = env->stats.t; 380 if (++env->stats.a <= 0) 381 return 1; 382 cm = env->stats.m; 383 env->stats.m = 0; 384 cn = env->stats.n; 385 env->stats.n = 0; 386 if (stats(env, e->re.group.expr.binary.left)) 387 return 1; 388 m = env->stats.m; 389 env->stats.m = 0; 390 n = env->stats.n; 391 env->stats.n = 0; 392 if (e->re.group.expr.binary.right && stats(env, e->re.group.expr.binary.right)) 393 return 1; 394 if (env->stats.m > m) 395 env->stats.m = m; 396 else 397 m = env->stats.m; 398 if ((env->stats.m += cm) < m) 399 return 1; 400 if (env->stats.n < n) 401 env->stats.n = n; 402 else 403 n = env->stats.n; 404 if ((env->stats.n += cn) < n) 405 return 1; 406 env->stats.x = x; 407 env->stats.l = l; 408 env->stats.y = y; 409 env->stats.k = k; 410 env->stats.t = t; 411 break; 412 case REX_BACK: 413 if (++env->stats.b <= 0) 414 return 1; 415 break; 416 case REX_CLASS: 417 case REX_COLL_CLASS: 418 case REX_DOT: 419 case REX_ONECHAR: 420 n = env->stats.m; 421 if ((env->stats.m += e->lo) < n) 422 return 1; 423 if (e->hi != RE_DUP_INF) 424 { 425 n = env->stats.n; 426 if ((env->stats.n += e->hi) < n) 427 return 1; 428 } 429 if (e->lo != e->hi) 430 { 431 if (++env->stats.c <= 0) 432 return 1; 433 if (++env->stats.s <= 0) 434 return 1; 435 } 436 break; 437 case REX_CONJ: 438 cm = env->stats.m; 439 env->stats.m = 0; 440 cn = env->stats.n; 441 env->stats.n = 0; 442 if (stats(env, e->re.group.expr.binary.left)) 443 return 1; 444 nm = env->stats.m; 445 env->stats.m = 0; 446 nn = env->stats.n; 447 env->stats.n = 0; 448 if (stats(env, e->re.group.expr.binary.right)) 449 return 1; 450 if (env->stats.m < nm) 451 env->stats.m = nm; 452 else 453 nm = env->stats.m; 454 if ((env->stats.m += cm) < nm) 455 return 1; 456 if (env->stats.n < nn) 457 env->stats.n = nn; 458 else 459 nn = env->stats.n; 460 if ((env->stats.n += cn) < nn) 461 return 1; 462 break; 463 case REX_GROUP: 464 if (e->re.group.number && ++env->stats.p <= 0 || !e->re.group.number && ++env->stats.u <= 0) 465 return 1; 466 if (stats(env, e->re.group.expr.rex)) 467 return 1; 468 break; 469 case REX_GROUP_AHEAD: 470 case REX_GROUP_AHEAD_NOT: 471 case REX_GROUP_BEHIND: 472 case REX_GROUP_BEHIND_NOT: 473 m = env->stats.m; 474 n = env->stats.n; 475 x = env->stats.x; 476 y = env->stats.y; 477 if (stats(env, e->re.group.expr.rex)) 478 return 1; 479 env->stats.m = m; 480 env->stats.n = n; 481 env->stats.x = x; 482 env->stats.y = y; 483 switch (e->type) 484 { 485 case REX_GROUP_AHEAD: 486 case REX_GROUP_BEHIND: 487 if (++env->stats.u <= 0) 488 return 1; 489 break; 490 } 491 break; 492 case REX_GROUP_COND: 493 if (++env->stats.u <= 0) 494 return 1; 495 m = env->stats.m; 496 n = env->stats.n; 497 x = env->stats.x; 498 y = env->stats.y; 499 if (e->re.group.size > 0 && ++env->stats.b <= 0) 500 return 1; 501 if (e->re.group.expr.binary.left && stats(env, e->re.group.expr.binary.left)) 502 return 1; 503 if (q = e->re.group.expr.binary.right) 504 { 505 if (q->re.group.expr.binary.left && stats(env, q->re.group.expr.binary.left)) 506 return 1; 507 if (q->re.group.expr.binary.right && stats(env, q->re.group.expr.binary.right)) 508 return 1; 509 } 510 env->stats.m = m; 511 env->stats.n = n; 512 env->stats.x = x; 513 env->stats.y = y; 514 break; 515 case REX_GROUP_CUT: 516 if (++env->stats.u <= 0) 517 return 1; 518 m = env->stats.m; 519 n = env->stats.n; 520 x = env->stats.x; 521 y = env->stats.y; 522 if (stats(env, e->re.group.expr.rex)) 523 return 1; 524 env->stats.m = m; 525 env->stats.n = n; 526 env->stats.x = x; 527 env->stats.y = y; 528 break; 529 case REX_NEG: 530 env->stats.i++; 531 x = env->stats.x; 532 l = env->stats.l; 533 y = env->stats.y; 534 k = env->stats.k; 535 t = env->stats.t; 536 cm = env->stats.m; 537 env->stats.m = 0; 538 if (stats(env, e->re.group.expr.rex)) 539 return 1; 540 env->stats.m = !env->stats.m; 541 if ((env->stats.m += cm) < cm) 542 return 1; 543 env->stats.x = x; 544 env->stats.l = l; 545 env->stats.y = y; 546 env->stats.k = k; 547 env->stats.t = t; 548 break; 549 case REX_REP: 550 x = env->stats.x; 551 l = env->stats.l; 552 y = env->stats.y; 553 k = env->stats.k; 554 t = env->stats.t; 555 if (++env->stats.c <= 0) 556 return 1; 557 b = env->stats.b; 558 c = env->stats.c; 559 cm = env->stats.m; 560 env->stats.m = 0; 561 if (stats(env, e->re.group.expr.rex)) 562 return 1; 563 if (env->stats.m == 1 && b == env->stats.b && c == env->stats.c && ++env->stats.s <= 0) 564 return 1; 565 if (e->lo < 1) 566 { 567 env->stats.x = x; 568 env->stats.l = l; 569 env->stats.y = y; 570 env->stats.k = k; 571 env->stats.t = t; 572 env->stats.m = cm; 573 } 574 else 575 { 576 m = env->stats.m; 577 if ((env->stats.m *= e->lo) > 0 && env->stats.m < m) 578 return 1; 579 m = env->stats.m; 580 if ((env->stats.m += cm) < m) 581 return 1; 582 if (env->stats.x != x) 583 env->stats.l = cm; 584 if (env->stats.y != y) 585 env->stats.k = cm; 586 } 587 break; 588 case REX_STRING: 589 if (!e->map) 590 { 591 cm = env->stats.m; 592 if ((env->stats.m += e->re.string.size) < cm) 593 return 1; 594 cn = env->stats.n; 595 if ((env->stats.n += e->re.string.size) < cn) 596 return 1; 597 if (!env->stats.x || env->stats.x->re.string.size < e->re.string.size) 598 { 599 env->stats.x = e; 600 env->stats.l = cm; 601 } 602 } 603 break; 604 case REX_TRIE: 605 if (++env->stats.s <= 0) 606 return 1; 607 cm = env->stats.m; 608 if ((env->stats.m += e->re.trie.min) < cm) 609 return 1; 610 cn = env->stats.n; 611 if ((env->stats.n += e->re.trie.max) < cn) 612 return 1; 613 env->stats.t++; 614 if (!env->stats.y || env->stats.y->re.trie.min < e->re.trie.min) 615 { 616 env->stats.y = e; 617 env->stats.k = cm; 618 } 619 break; 620 } 621 } while (e = e->next); 622 return 0; 623 } 624 625 static int token(Cenv_t*); 626 627 static int 628 magic(register Cenv_t* env, register int c, int escaped) 629 { 630 register char* sp; 631 register int n; 632 int o = c; 633 int e = env->error; 634 int l = env->token.len; 635 short* mp; 636 char* ep; 637 638 if (mp = state.magic[c]) 639 { 640 c = mp[env->type+escaped]; 641 if (c >= T_META) 642 { 643 sp = (char*)env->cursor + env->token.len; 644 switch (c) 645 { 646 case T_LEFT: 647 n = 0; 648 ep = sp; 649 while (*sp >= '0' && *sp <= '9') 650 { 651 if (n > (INT_MAX / 10)) 652 { 653 env->error = REG_BADBR; 654 goto bad; 655 } 656 n = n * 10 + *sp++ - '0'; 657 } 658 if (sp == ep) 659 { 660 if (env->type < SRE || *sp != ',') 661 { 662 env->error = *sp ? REG_BADBR : REG_EBRACE; 663 goto bad; 664 } 665 } 666 else if (n > RE_DUP_MAX) 667 { 668 env->error = REG_BADBR; 669 goto bad; 670 } 671 env->token.min = n; 672 if (*sp == ',') 673 { 674 n = 0; 675 ep = ++sp; 676 while (*sp >= '0' && *sp <= '9') 677 { 678 if (n > (INT_MAX / 10)) 679 { 680 env->error = REG_BADBR; 681 goto bad; 682 } 683 n = n * 10 + *sp++ - '0'; 684 } 685 if (sp == ep) 686 n = RE_DUP_INF; 687 else if (n < env->token.min) 688 { 689 env->error = REG_BADBR; 690 goto bad; 691 } 692 } 693 env->token.max = n; 694 switch (*sp) 695 { 696 case 0: 697 env->error = REG_EBRACE; 698 goto bad; 699 case '\\': 700 if (!escaped) 701 { 702 env->error = REG_BADBR; 703 goto bad; 704 } 705 sp++; 706 break; 707 default: 708 if (escaped) 709 { 710 env->error = REG_BADBR; 711 goto bad; 712 } 713 break; 714 } 715 switch (*sp++) 716 { 717 case 0: 718 env->error = REG_EBRACE; 719 goto bad; 720 case '}': 721 break; 722 default: 723 env->error = REG_BADBR; 724 goto bad; 725 } 726 env->token.len = sp - (char*)env->cursor; 727 break; 728 case T_RIGHT: 729 env->error = REG_EBRACE; 730 goto bad; 731 case T_OPEN: 732 if (env->type < SRE && *sp == '?') 733 { 734 env->token.len++; 735 env->token.lex = 0; 736 goto group; 737 } 738 break; 739 case T_ESCAPE: 740 c = chresc(sp - 2, &ep); 741 if (ep < sp) 742 goto bad; 743 env->token.len += ep - sp; 744 if (c >= T_META) 745 { 746 env->token.lex = c; 747 c = C_ESC; 748 } 749 return c; 750 case T_BACK+0: 751 case T_BACK+1: 752 case T_BACK+2: 753 case T_BACK+3: 754 case T_BACK+4: 755 case T_BACK+5: 756 case T_BACK+6: 757 case T_BACK+7: 758 n = chresc(sp - 2, &ep); 759 if (ep > sp + 1) 760 { 761 env->token.len += ep - sp; 762 return n; 763 } 764 /*FALLTHROUGH*/ 765 case T_BACK+8: 766 case T_BACK+9: 767 if (env->type == SRE || c == T_BACK && !(env->flags & REG_LENIENT)) 768 { 769 env->error = REG_BADESC; 770 goto bad; 771 } 772 if ((env->flags & REG_MULTIREF) && isdigit(*sp)) 773 { 774 c = (c - T_BACK) * 10 + (*sp - '0'); 775 if (c > 0 && c <= env->parno && env->paren[c]) 776 c += T_BACK; 777 else 778 c = chresc(sp - 2, &ep); 779 env->token.len++; 780 } 781 if (c == T_BACK) 782 c = 0; 783 break; 784 case T_BAD: 785 if (escaped == 1 && (env->flags & REG_LENIENT) && (c = mp[env->type+escaped+2]) >= T_META) 786 return c; 787 goto bad; 788 } 789 if (env->type >= SRE) 790 { 791 if (c == T_DOT) 792 c = '.'; 793 else if (c < T_OPEN) 794 { 795 if (env->type == KRE && *(env->cursor + env->token.len) == '-' && *(env->cursor + env->token.len + 1) == '(') 796 { 797 env->token.len++; 798 env->token.att = 1; 799 } 800 if (env->type == KRE && *(env->cursor + env->token.len) == '(') 801 { 802 env->token.len++; 803 switch (c) 804 { 805 case T_AT: 806 break; 807 case T_PERCENT: 808 env->token.lex = c; 809 goto group; 810 case T_TILDE: 811 env->token.lex = 0; 812 goto group; 813 default: 814 env->token.lex = c; 815 break; 816 } 817 c = T_OPEN; 818 } 819 else if (c == T_STAR) 820 c = T_DOTSTAR; 821 else if (c == T_QUES) 822 c = T_DOT; 823 else 824 { 825 c = o; 826 env->token.len = l; 827 } 828 } 829 else if (c > T_BACK) 830 { 831 c = (c - T_BACK) * 2 - 1; 832 c = (c > env->parno || !env->paren[c]) ? o : T_BACK + c; 833 } 834 else if (env->type == KRE && !env->parnest && (env->flags & REG_SHELL_GROUP)) 835 { 836 if (c == T_AND) 837 c = '&'; 838 else if (c == T_BAR) 839 c = '|'; 840 else if (c == T_OPEN) 841 c = '('; 842 } 843 } 844 } 845 } 846 else if (escaped == 2) 847 { 848 if (env->type >= SRE && !(env->flags & REG_SHELL_ESCAPED) || (env->flags & REG_ESCAPE) && (c == '[' || c == '-' || c == ']' || env->delimiter && c == env->delimiter)) 849 /*ok*/; 850 else 851 { 852 env->error = REG_BADESC; 853 goto bad; 854 } 855 } 856 else if (escaped && !(env->flags & REG_LENIENT) && c != ']') 857 { 858 env->error = REG_BADESC; 859 goto bad; 860 } 861 return c; 862 group: 863 sp = (char*)env->cursor + env->token.len; 864 switch (*sp++) 865 { 866 case ')': 867 break; 868 case '#': 869 for (;;) 870 { 871 switch (*sp++) 872 { 873 case 0: 874 env->error = REG_EPAREN; 875 return T_BAD; 876 case ')': 877 break; 878 default: 879 continue; 880 } 881 break; 882 } 883 break; 884 default: 885 return T_GROUP; 886 } 887 env->cursor = (unsigned char*)sp; 888 return token(env); 889 bad: 890 if (escaped == 2) 891 env->error = e; 892 else if (env->flags & REG_LENIENT) 893 return o; 894 else if (escaped == 1 && !env->error) 895 { 896 if (mp || o == ']') 897 return o; 898 env->error = REG_BADESC; 899 } 900 return T_BAD; 901 } 902 903 static int 904 token(register Cenv_t* env) 905 { 906 int c; 907 int posixkludge; 908 909 if (env->token.push) 910 return env->token.lex; 911 env->token.att = env->token.esc = 0; 912 if ((env->token.len = MBSIZE(env->cursor)) > 1) 913 return env->token.lex = C_MB; 914 env->token.lex = 0; 915 for (;;) 916 { 917 c = *env->cursor; 918 if (c == 0 || c == env->delimiter || c == env->terminator) 919 return T_END; 920 if (!(env->flags & REG_COMMENT)) 921 break; 922 if (c == '#') 923 { 924 do 925 { 926 c = *++env->cursor; 927 if (c == 0 || c == env->delimiter) 928 return T_END; 929 } while (c != '\n'); 930 } 931 else if (!isspace(c)) 932 break; 933 env->cursor++; 934 } 935 if (c == '\n' && (env->flags & REG_MULTIPLE) && !env->delimiter) 936 { 937 if (env->parnest) 938 { 939 env->error = REG_EPAREN; 940 return T_BAD; 941 } 942 env->parno = 0; 943 env->pattern = env->cursor + 1; 944 return T_BAR; 945 } 946 if (env->flags & REG_LITERAL) 947 return c; 948 if (posixkludge = env->posixkludge) 949 { 950 env->posixkludge = 0; 951 if (c == '*') 952 return c; 953 } 954 if (c == '\\') 955 { 956 if (env->flags & REG_SHELL_ESCAPED) 957 return c; 958 if (!(c = *(env->cursor + 1)) || c == env->terminator) 959 { 960 if (env->flags & REG_LENIENT) 961 { 962 if (c) 963 { 964 env->token.esc = env->token.len; 965 env->token.len += MBSIZE(env->cursor + 1); 966 return c; 967 } 968 return '\\'; 969 } 970 env->error = REG_EESCAPE; 971 return T_BAD; 972 } 973 env->token.esc = env->token.len; 974 env->token.len += MBSIZE(env->cursor + 1); 975 if (env->delimiter && c == 'n') 976 return '\n'; 977 else if (c == env->delimiter) 978 return magic(env, c, 0); 979 else if (c == '(' && env->type == BRE) 980 env->posixkludge = 1; 981 else if (c == ')' && env->type == BRE && env->parnest <= 0) 982 { 983 env->error = REG_EPAREN; 984 return T_BAD; 985 } 986 else if (isspace(c) && (env->flags & REG_COMMENT)) 987 return c; 988 return magic(env, c, 1); 989 } 990 else if (c == '$') 991 { 992 if (env->type == BRE && (*(env->cursor + 1) == 0 || *(env->cursor + 1) == env->delimiter || *(env->cursor + 1) == env->terminator || *(env->cursor + 1) == '\\' && *(env->cursor + 2) == ')') || (env->flags & REG_MULTIPLE) && *(env->cursor + 1) == '\n') 993 return T_DOLL; 994 } 995 else if (c == '^') 996 { 997 if (env->type == BRE && (env->cursor == env->pattern || posixkludge)) 998 { 999 env->posixkludge = 1; 1000 return T_CFLX; 1001 } 1002 } 1003 else if (c == ')') 1004 { 1005 if (env->type != BRE && env->parnest <= 0) 1006 return c; 1007 } 1008 else if (c == '/' && env->explicit == env->mappedslash) 1009 { 1010 while (*(env->cursor + env->token.len) == c) 1011 env->token.len++; 1012 return T_SLASHPLUS; 1013 } 1014 return magic(env, c, 0); 1015 } 1016 1017 static Celt_t* 1018 col(Celt_t* ce, int ic, unsigned char* bp, int bw, int bc, unsigned char* ep, int ew, int ec) 1019 { 1020 register char* s; 1021 register unsigned char* k; 1022 register unsigned char* e; 1023 register int c; 1024 register int cc; 1025 int bt; 1026 int et; 1027 Ckey_t key; 1028 1029 cc = 0; 1030 for (;;) 1031 { 1032 k = key; 1033 if (bw == 1) 1034 { 1035 c = bc; 1036 if (ic) 1037 { 1038 if (isupper(c)) 1039 { 1040 c = tolower(c); 1041 cc = -1; 1042 } 1043 else if (islower(c)) 1044 { 1045 c = toupper(c); 1046 cc = -1; 1047 } 1048 } 1049 *k++ = c; 1050 } 1051 else if (bw < COLL_KEY_MAX) 1052 { 1053 s = (char*)bp; 1054 if (ic) 1055 { 1056 c = mbchar(s); 1057 if (iswupper(c)) 1058 { 1059 c = towlower(c); 1060 cc = 1; 1061 } 1062 else if (iswlower(c)) 1063 { 1064 c = towupper(c); 1065 cc = 1; 1066 } 1067 } 1068 if (cc > 0) 1069 { 1070 cc = -1; 1071 k += wctomb((char*)k, c); 1072 } 1073 else 1074 for (e = k + bw; k < e; *k++ = *s++); 1075 } 1076 *k = 0; 1077 mbxfrm(ce->beg, key, COLL_KEY_MAX); 1078 if (ep) 1079 { 1080 k = key; 1081 c = mbchar(k); 1082 if (iswupper(c)) 1083 bt = COLL_range_uc; 1084 else if (iswlower(c)) 1085 bt = COLL_range_lc; 1086 else 1087 bt = COLL_range; 1088 k = key; 1089 if (ew == 1) 1090 { 1091 c = ec; 1092 if (ic) 1093 { 1094 if (isupper(c)) 1095 { 1096 c = tolower(c); 1097 cc = -1; 1098 } 1099 else if (islower(c)) 1100 { 1101 c = toupper(c); 1102 cc = -1; 1103 } 1104 } 1105 *k++ = c; 1106 } 1107 else if (ew < COLL_KEY_MAX) 1108 { 1109 s = (char*)ep; 1110 if (ic) 1111 { 1112 c = mbchar(s); 1113 if (iswupper(c)) 1114 { 1115 c = towlower(c); 1116 cc = 1; 1117 } 1118 else if (iswlower(c)) 1119 { 1120 c = towupper(c); 1121 cc = 1; 1122 } 1123 } 1124 if (cc > 0) 1125 { 1126 cc = -1; 1127 k += wctomb((char*)k, c); 1128 } 1129 else 1130 for (e = k + ew; k < e; *k++ = *s++); 1131 } 1132 *k = 0; 1133 mbxfrm(ce->end, key, COLL_KEY_MAX); 1134 k = key; 1135 c = mbchar(k); 1136 if (iswupper(c)) 1137 et = COLL_range_uc; 1138 else if (iswlower(c)) 1139 et = COLL_range_lc; 1140 else 1141 et = COLL_range; 1142 ce->typ = bt == et ? bt : COLL_range; 1143 } 1144 else 1145 ce->typ = COLL_char; 1146 ce++; 1147 if (!ic || !cc) 1148 break; 1149 ic = 0; 1150 } 1151 return ce; 1152 } 1153 1154 static Rex_t* 1155 bra(Cenv_t* env) 1156 { 1157 Rex_t* e; 1158 int c; 1159 int i; 1160 int w; 1161 int neg; 1162 int last; 1163 int inrange; 1164 int complicated; 1165 int collate; 1166 int elements; 1167 unsigned char* first; 1168 unsigned char* start; 1169 unsigned char* begin; 1170 unsigned char* s; 1171 regclass_t f; 1172 unsigned char buf[4 * (COLL_KEY_MAX + 1)]; 1173 #if _PACKAGE_ast 1174 int ic; 1175 char mbc[COLL_KEY_MAX + 1]; 1176 #endif 1177 1178 if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) 1179 return 0; 1180 collate = complicated = elements = 0; 1181 if (*env->cursor == '^' || env->type >= SRE && *env->cursor == '!') 1182 { 1183 env->cursor++; 1184 neg = 1; 1185 } 1186 else 1187 neg = 0; 1188 first = env->cursor; 1189 start = first + MBSIZE(first); 1190 if (*env->cursor == 0 || *(env->cursor + 1) == 0 || *env->cursor == env->terminator || *(env->cursor + 1) == env->terminator || (env->flags & REG_ESCAPE) && (*env->cursor == env->delimiter || *env->cursor != '\\' && *(env->cursor + 1) == env->delimiter)) 1191 goto error; 1192 begin = env->cursor + MBSIZE(env->cursor); 1193 1194 /* 1195 * inrange: 0=no, 1=possibly, 2=definitely 1196 */ 1197 1198 inrange = 0; 1199 for (;;) 1200 { 1201 if (!(c = *env->cursor) || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter) 1202 goto error; 1203 env->cursor += (w = MBSIZE(env->cursor)); 1204 if (c == '\\') 1205 { 1206 if (*env->cursor) 1207 { 1208 if (*env->cursor == 'n') 1209 { 1210 env->cursor++; 1211 c = '\n'; 1212 } 1213 else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) 1214 { 1215 env->token.len = 1; 1216 w = magic(env, *env->cursor, 2); 1217 if (env->token.len > 1 || w != T_BAD) 1218 { 1219 if (env->token.len == 1 && (f = classfun(w))) 1220 { 1221 if (inrange > 1) 1222 { 1223 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1224 goto erange; 1225 inrange = 0; 1226 } 1227 env->cursor++; 1228 for (c = 0; c <= UCHAR_MAX; c++) 1229 if ((*f)(c)) 1230 setadd(e->re.charclass, c); 1231 complicated++; 1232 elements++; 1233 continue; 1234 } 1235 if (env->token.len > 1 || w >= 0 && w < T_META) 1236 { 1237 c = w; 1238 if (c > UCHAR_MAX) 1239 { 1240 if (env->type < SRE && !(env->flags & REG_LENIENT) && !mbwide()) 1241 goto erange; 1242 c = UCHAR_MAX; 1243 } 1244 env->cursor += env->token.len; 1245 } 1246 } 1247 } 1248 } 1249 } 1250 else if (c == ']') 1251 { 1252 if (env->cursor == begin) 1253 { 1254 last = c; 1255 inrange = 1; 1256 continue; 1257 } 1258 if (inrange != 0) 1259 { 1260 setadd(e->re.charclass, last); 1261 elements++; 1262 if (inrange == 2) 1263 { 1264 setadd(e->re.charclass, '-'); 1265 elements++; 1266 } 1267 } 1268 break; 1269 } 1270 else if (c == '-') 1271 { 1272 if (!inrange && env->cursor != begin && *env->cursor != ']') 1273 { 1274 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1275 goto erange; 1276 continue; 1277 } 1278 else if (inrange == 1) 1279 { 1280 inrange = 2; 1281 complicated++; 1282 continue; 1283 } 1284 } 1285 else if (c == '[') 1286 { 1287 switch (*env->cursor) 1288 { 1289 case 0: 1290 goto error; 1291 case ':': 1292 if (inrange == 1) 1293 { 1294 setadd(e->re.charclass, last); 1295 elements++; 1296 } 1297 if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) 1298 { 1299 if (env->cursor == start && (c = *(env->cursor + 1))) 1300 { 1301 s = start = env->cursor + 1; 1302 while (*++s && *s != ':'); 1303 if (*s == ':' && *(s + 1) == ']' && *(s + 2) == ']') 1304 { 1305 if ((i = (s - start)) == 1) 1306 { 1307 switch (c) 1308 { 1309 case '<': 1310 i = REX_WBEG; 1311 break; 1312 case '>': 1313 i = REX_WEND; 1314 break; 1315 default: 1316 i = 0; 1317 break; 1318 } 1319 if (i) 1320 { 1321 env->cursor = s + 3; 1322 drop(env->disc, e); 1323 return node(env, i, 0, 0, 0); 1324 } 1325 } 1326 } 1327 } 1328 env->error = REG_ECTYPE; 1329 goto error; 1330 } 1331 for (c = 0; c <= UCHAR_MAX; c++) 1332 if ((*f)(c)) 1333 setadd(e->re.charclass, c); 1334 inrange = 0; 1335 complicated++; 1336 elements++; 1337 continue; 1338 case '=': 1339 if (inrange == 2) 1340 goto erange; 1341 if (inrange == 1) 1342 { 1343 setadd(e->re.charclass, last); 1344 elements++; 1345 } 1346 if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0) 1347 goto ecollate; 1348 if (c > 1) 1349 collate++; 1350 else 1351 setadd(e->re.charclass, buf[0]); 1352 c = buf[0]; 1353 inrange = 0; 1354 complicated++; 1355 elements++; 1356 continue; 1357 case '.': 1358 if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0) 1359 goto ecollate; 1360 if (c > 1) 1361 collate++; 1362 c = buf[0]; 1363 complicated++; 1364 break; 1365 default: 1366 if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) 1367 goto error; 1368 break; 1369 } 1370 } 1371 else if (w > 1) 1372 complicated++; 1373 if (inrange == 2) 1374 { 1375 if (last > c) 1376 { 1377 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1378 goto erange; 1379 setadd(e->re.charclass, last); 1380 setadd(e->re.charclass, c); 1381 } 1382 else 1383 for (i = last; i <= c; i++) 1384 setadd(e->re.charclass, i); 1385 inrange = env->type >= SRE || (env->flags & REG_LENIENT); 1386 elements += 2; 1387 } 1388 else if (inrange == 1) 1389 { 1390 setadd(e->re.charclass, last); 1391 elements++; 1392 } 1393 else 1394 inrange = 1; 1395 last = c; 1396 } 1397 #if _PACKAGE_ast 1398 if (complicated && mbcoll()) 1399 { 1400 Dt_t* dt; 1401 Cchr_t* cc; 1402 Cchr_t* tc; 1403 Cchr_t* xc; 1404 Celt_t* ce; 1405 Cchr_t key; 1406 int rw; 1407 int rc; 1408 int wc; 1409 unsigned char* rp; 1410 unsigned char* pp; 1411 char* wp; 1412 char cb[2][COLL_KEY_MAX+1]; 1413 1414 static Dtdisc_t disc; 1415 1416 static const char primary[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; 1417 1418 if (!(dt = (Dt_t*)LCINFO(AST_LC_COLLATE)->data)) 1419 { 1420 disc.key = offsetof(Cchr_t, key); 1421 if ((cc = newof(0, Cchr_t, elementsof(primary), 0)) && (dt = dtopen(&disc, Dttree))) 1422 { 1423 for (i = 0; i < elementsof(primary) - 1; i++, cc++) 1424 { 1425 cc->nam[0] = primary[i]; 1426 mbxfrm(cc->key, cc->nam, COLL_KEY_MAX); 1427 dtinsert(dt, cc); 1428 } 1429 for (i = 0; i < elementsof(cc->key); i++) 1430 cc->key[i] = ~0; 1431 dtinsert(dt, cc); 1432 LCINFO(AST_LC_COLLATE)->data = (void*)dt; 1433 } 1434 else 1435 { 1436 if (cc) 1437 free(cc); 1438 drop(env->disc, e); 1439 return 0; 1440 } 1441 } 1442 if (dt) 1443 { 1444 drop(env->disc, e); 1445 if (ic = env->flags & REG_ICASE) 1446 elements *= 2; 1447 if (!(e = node(env, REX_COLL_CLASS, 1, 1, (elements + 2) * sizeof(Celt_t)))) 1448 return 0; 1449 ce = (Celt_t*)e->re.data; 1450 e->re.collate.invert = neg; 1451 e->re.collate.elements = ce; 1452 env->cursor = first; 1453 inrange = 0; 1454 for (;;) 1455 { 1456 if ((c = *env->cursor) == 0 || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter) 1457 goto error; 1458 pp = env->cursor; 1459 env->cursor += (w = MBSIZE(env->cursor)); 1460 if (c == '\\') 1461 { 1462 if (*env->cursor) 1463 { 1464 if (*env->cursor == 'n') 1465 { 1466 pp = env->cursor++; 1467 c = '\n'; 1468 } 1469 else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) 1470 { 1471 env->token.len = 1; 1472 w = magic(env, *env->cursor, 2); 1473 if (env->token.len > 1 || w != T_BAD) 1474 { 1475 if (env->token.len == 1 && (f = classfun(w))) 1476 { 1477 if (inrange > 1) 1478 { 1479 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1480 goto erange; 1481 inrange = 0; 1482 } 1483 env->cursor++; 1484 ce->fun = f; 1485 ce->typ = COLL_call; 1486 ce++; 1487 continue; 1488 } 1489 if (env->token.len > 1 || w >= 0 && w < T_META) 1490 { 1491 c = w; 1492 w = wctomb(mbc, c); 1493 pp = (unsigned char*)mbc; 1494 env->cursor += env->token.len; 1495 } 1496 } 1497 } 1498 } 1499 } 1500 else if (c == ']') 1501 { 1502 if (env->cursor == begin) 1503 { 1504 rp = pp; 1505 rw = w; 1506 inrange = 1; 1507 continue; 1508 } 1509 if (inrange != 0) 1510 { 1511 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1512 if (inrange == 2) 1513 ce = col(ce, ic, NiL, 1, '-', NiL, 0, 0); 1514 } 1515 break; 1516 } 1517 else if (c == '-') 1518 { 1519 if (!inrange && env->cursor != begin && *env->cursor != ']') 1520 { 1521 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1522 goto erange; 1523 continue; 1524 } 1525 else if (inrange == 1) 1526 { 1527 inrange = 2; 1528 continue; 1529 } 1530 } 1531 else if (c == '[') 1532 { 1533 switch (*env->cursor) 1534 { 1535 case 0: 1536 goto error; 1537 case ':': 1538 if (inrange == 1) 1539 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1540 if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) 1541 { 1542 if (env->cursor == start && (c = *(env->cursor + 1)) && *(env->cursor + 2) == ':' && *(env->cursor + 3) == ']' && *(env->cursor + 4) == ']') 1543 { 1544 switch (c) 1545 { 1546 case '<': 1547 i = REX_WBEG; 1548 break; 1549 case '>': 1550 i = REX_WEND; 1551 break; 1552 default: 1553 i = 0; 1554 break; 1555 } 1556 if (i) 1557 { 1558 env->cursor += 5; 1559 drop(env->disc, e); 1560 return node(env, i, 0, 0, 0); 1561 } 1562 } 1563 env->error = REG_ECTYPE; 1564 goto error; 1565 } 1566 ce->fun = f; 1567 ce->typ = COLL_call; 1568 ce++; 1569 inrange = 0; 1570 continue; 1571 case '=': 1572 if (inrange == 2) 1573 goto erange; 1574 if (inrange == 1) 1575 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1576 pp = (unsigned char*)cb[inrange]; 1577 rp = env->cursor + 1; 1578 if ((rw = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0) 1579 goto ecollate; 1580 wp = (char*)pp; 1581 wc = mbchar(wp); 1582 c = 0; 1583 if (ic) 1584 { 1585 if (iswupper(wc)) 1586 { 1587 wc = towlower(wc); 1588 rw = wctomb((char*)pp, wc); 1589 c = 'u'; 1590 } 1591 else if (iswlower(wc)) 1592 c = 'l'; 1593 } 1594 for (;;) 1595 { 1596 mbxfrm(key.key, (char*)pp, COLL_KEY_MAX); 1597 if (!(cc = (Cchr_t*)dtsearch(dt, &key)) && !(cc = (Cchr_t*)dtprev(dt, &key))) 1598 goto ecollate; 1599 xc = (tc = (Cchr_t*)dtprev(dt, cc)) && !strcasecmp((char*)tc->nam, (char*)cc->nam) ? tc : cc; 1600 if (c == 'l' || c == 'L' && !(c = 0)) 1601 ce->typ = COLL_range_lc; 1602 else if (c == 'u' || c == 'U' && !(c = 0)) 1603 ce->typ = COLL_range_uc; 1604 else 1605 ce->typ = COLL_range; 1606 strcpy((char*)ce->beg, (char*)xc->key); 1607 if (!(cc = (Cchr_t*)dtnext(dt, cc))) 1608 goto ecollate; 1609 if (!strcasecmp((char*)xc->nam, (char*)cc->nam) && (tc = (Cchr_t*)dtnext(dt, cc))) 1610 cc = tc; 1611 strcpy((char*)ce->end, (char*)cc->key); 1612 ce->max = -1; 1613 ce++; 1614 if (!c) 1615 break; 1616 if (c == 'u') 1617 { 1618 wc = towlower(wc); 1619 c = 'L'; 1620 } 1621 else 1622 { 1623 wc = towupper(wc); 1624 c = 'U'; 1625 } 1626 rw = wctomb((char*)pp, wc); 1627 } 1628 inrange = 0; 1629 c = *pp; 1630 continue; 1631 case '.': 1632 pp = (unsigned char*)cb[inrange]; 1633 if ((w = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0) 1634 goto ecollate; 1635 c = buf[0]; 1636 break; 1637 default: 1638 if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) 1639 goto error; 1640 break; 1641 } 1642 } 1643 if (inrange == 2) 1644 { 1645 ce = col(ce, ic, rp, rw, rc, pp, w, c); 1646 if (strcmp((char*)ce->beg, (char*)ce->end) > 0) 1647 { 1648 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1649 goto erange; 1650 (ce-1)->typ = COLL_char; 1651 strcpy((char*)ce->beg, (char*)(ce-1)->end); 1652 ce->typ = COLL_char; 1653 ce++; 1654 } 1655 inrange = env->type >= SRE || (env->flags & REG_LENIENT); 1656 } 1657 else if (inrange == 1) 1658 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1659 else 1660 inrange = 1; 1661 rp = pp; 1662 rw = w; 1663 rc = c; 1664 } 1665 ce->typ = COLL_end; 1666 return e; 1667 } 1668 } 1669 #endif 1670 if (collate) 1671 goto ecollate; 1672 if (env->flags & REG_ICASE) 1673 for (i = 0; i <= UCHAR_MAX; i++) 1674 if (settst(e->re.charclass, i)) 1675 { 1676 if (isupper(i)) 1677 c = tolower(i); 1678 else if (islower(i)) 1679 c = toupper(i); 1680 else 1681 continue; 1682 setadd(e->re.charclass, c); 1683 } 1684 if (neg) 1685 { 1686 for (i = 0; i < elementsof(e->re.charclass->bits); i++) 1687 e->re.charclass->bits[i] ^= ~0; 1688 if (env->explicit >= 0) 1689 setclr(e->re.charclass, env->explicit); 1690 } 1691 return e; 1692 ecollate: 1693 env->error = REG_ECOLLATE; 1694 goto error; 1695 erange: 1696 env->error = REG_ERANGE; 1697 error: 1698 drop(env->disc, e); 1699 if (!env->error) 1700 env->error = REG_EBRACK; 1701 return 0; 1702 } 1703 1704 static Rex_t* 1705 ccl(Cenv_t* env, int type) 1706 { 1707 int i; 1708 Rex_t* e; 1709 Celt_t* ce; 1710 regclass_t f; 1711 1712 if (!(f = classfun(type))) 1713 { 1714 env->error = REG_BADESC; 1715 return 0; 1716 } 1717 if (!mbcoll()) 1718 { 1719 if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) 1720 return 0; 1721 for (i = 0; i <= UCHAR_MAX; i++) 1722 if ((*f)(i)) 1723 setadd(e->re.charclass, i); 1724 if (env->explicit >= 0) 1725 setclr(e->re.charclass, env->explicit); 1726 } 1727 else 1728 { 1729 if (!(e = node(env, REX_COLL_CLASS, 1, 1, 2 * sizeof(Celt_t)))) 1730 return 0; 1731 ce = (Celt_t*)e->re.data; 1732 e->re.collate.invert = 0; 1733 e->re.collate.elements = ce; 1734 ce->fun = f; 1735 ce->typ = COLL_call; 1736 ce++; 1737 ce->typ = COLL_end; 1738 } 1739 return e; 1740 } 1741 1742 static Rex_t* 1743 rep(Cenv_t* env, Rex_t* e, int number, int last) 1744 { 1745 Rex_t* f; 1746 unsigned long m = 0; 1747 unsigned long n = RE_DUP_INF; 1748 int minimal = -1; 1749 1750 if (!e) 1751 return 0; 1752 switch (token(env)) 1753 { 1754 case T_BANG: 1755 eat(env); 1756 if (!(f = node(env, REX_NEG, m, n, 0))) 1757 { 1758 drop(env->disc, e); 1759 return 0; 1760 } 1761 f->re.group.expr.rex = e; 1762 return f; 1763 case T_QUES: 1764 eat(env); 1765 n = 1; 1766 break; 1767 case T_STAR: 1768 eat(env); 1769 break; 1770 case T_PLUS: 1771 eat(env); 1772 m = 1; 1773 break; 1774 case T_LEFT: 1775 eat(env); 1776 m = env->token.min; 1777 n = env->token.max; 1778 break; 1779 default: 1780 return e; 1781 } 1782 if (env->token.att) 1783 minimal = 1; 1784 else if (env->type < SRE) 1785 switch (token(env)) 1786 { 1787 case T_QUES: 1788 eat(env); 1789 minimal = !(env->flags & REG_MINIMAL); 1790 break; 1791 case T_STAR: /*AST*/ 1792 eat(env); 1793 minimal = !!(env->flags & REG_MINIMAL); 1794 break; 1795 } 1796 switch (e->type) 1797 { 1798 case REX_DOT: 1799 case REX_CLASS: 1800 case REX_COLL_CLASS: 1801 case REX_ONECHAR: 1802 e->lo = m; 1803 e->hi = n; 1804 if (minimal >= 0) 1805 mark(e, minimal); 1806 return e; 1807 #if HUH_2002_08_07 1808 case REX_BEG: 1809 #endif 1810 case REX_BEG_STR: 1811 case REX_END_STR: 1812 case REX_FIN_STR: 1813 case REX_WBEG: 1814 case REX_WEND: 1815 case REX_WORD: 1816 case REX_WORD_NOT: 1817 env->error = REG_BADRPT; 1818 drop(env->disc, e); 1819 return 0; 1820 } 1821 if (m == 1 && n == 1) 1822 { 1823 if (minimal >= 0) 1824 mark(e, minimal); 1825 return e; 1826 } 1827 if (!(f = node(env, REX_REP, m, n, 0))) 1828 { 1829 drop(env->disc, e); 1830 return 0; 1831 } 1832 f->re.group.expr.rex = e; 1833 f->re.group.number = number; 1834 f->re.group.last = last; 1835 if (minimal >= 0) 1836 mark(f, minimal); 1837 if (m <= n && n) 1838 { 1839 for (; e && e->type >= REX_GROUP && e->type <= REX_GROUP_CUT; e = e->re.group.expr.rex); 1840 if (e && e->type == REX_NEG) 1841 f->type = REX_GROUP; 1842 } 1843 return f; 1844 } 1845 1846 static int 1847 isstring(Rex_t* e) 1848 { 1849 switch (e->type) 1850 { 1851 case REX_ONECHAR: 1852 return e->lo == 1 && e->hi == 1; 1853 case REX_STRING: 1854 return 1; 1855 } 1856 return 0; 1857 } 1858 1859 static Trie_node_t* 1860 trienode(Cenv_t* env, int c) 1861 { 1862 Trie_node_t* t; 1863 1864 if (t = (Trie_node_t*)alloc(env->disc, 0, sizeof(Trie_node_t))) 1865 { 1866 memset(t, 0, sizeof(Trie_node_t)); 1867 t->c = c; 1868 } 1869 return t; 1870 } 1871 1872 static int 1873 insert(Cenv_t* env, Rex_t* f, Rex_t* g) 1874 { 1875 unsigned char* s; 1876 unsigned char* e; 1877 Trie_node_t* t; 1878 int len; 1879 unsigned char tmp[2]; 1880 1881 switch (f->type) 1882 { 1883 case REX_ONECHAR: 1884 *(s = tmp) = f->re.onechar; 1885 e = s + 1; 1886 break; 1887 case REX_STRING: 1888 s = f->re.string.base; 1889 e = s + f->re.string.size; 1890 break; 1891 default: 1892 return 1; 1893 } 1894 if (!(t = g->re.trie.root[*s]) && !(t = g->re.trie.root[*s] = trienode(env, *s))) 1895 return 1; 1896 for (len = 1;;) 1897 { 1898 if (t->c == *s) 1899 { 1900 if (++s >= e) 1901 break; 1902 if (!t->son && !(t->son = trienode(env, *s))) 1903 return 1; 1904 t = t->son; 1905 len++; 1906 } 1907 else 1908 { 1909 if (!t->sib && !(t->sib = trienode(env, *s))) 1910 return 1; 1911 t = t->sib; 1912 } 1913 } 1914 if (g->re.trie.min > len) 1915 g->re.trie.min = len; 1916 if (g->re.trie.max < len) 1917 g->re.trie.max = len; 1918 t->end = 1; 1919 return 0; 1920 } 1921 1922 /* 1923 * trie() tries to combine nontrivial e and f into a REX_TRIE 1924 * unless 0 is returned, e and f are deleted as far as possible 1925 * also called by regcomb 1926 */ 1927 1928 static Rex_t* 1929 trie(Cenv_t* env, Rex_t* e, Rex_t* f) 1930 { 1931 Rex_t* g; 1932 1933 if (e->next || f->next || !isstring(e) || e->flags != f->flags) 1934 return 0; 1935 if (isstring(f)) 1936 { 1937 if (!(g = node(env, REX_TRIE, 0, 0, (UCHAR_MAX + 1) * sizeof(Trie_node_t*)))) 1938 return 0; 1939 g->re.trie.min = INT_MAX; 1940 if (insert(env, f, g)) 1941 goto nospace; 1942 drop(env->disc, f); 1943 } 1944 else if (f->type != REX_TRIE) 1945 return 0; 1946 else 1947 g = f; 1948 if (insert(env, e, g)) 1949 goto nospace; 1950 drop(env->disc, e); 1951 return g; 1952 nospace: 1953 if (g != f) 1954 drop(env->disc, g); 1955 return 0; 1956 } 1957 1958 static Rex_t* alt(Cenv_t*, int, int); 1959 1960 static int 1961 chr(register Cenv_t* env, int* escaped) 1962 { 1963 unsigned char* p; 1964 int c; 1965 1966 *escaped = 0; 1967 if (!(c = *env->cursor)) 1968 return -1; 1969 env->cursor++; 1970 if (c == '\\') 1971 { 1972 if (env->flags & REG_SHELL_ESCAPED) 1973 return c; 1974 if (!(c = *(env->cursor + 1)) || c == env->terminator) 1975 { 1976 if (env->flags & REG_LENIENT) 1977 return c ? c : '\\'; 1978 env->error = REG_EESCAPE; 1979 return -1; 1980 } 1981 p = env->cursor; 1982 c = chresc((char*)env->cursor - 1, (char**)&env->cursor); 1983 *escaped = env->cursor - p; 1984 } 1985 return c; 1986 } 1987 1988 /* 1989 * open the perly gates 1990 */ 1991 1992 static Rex_t* 1993 grp(Cenv_t* env, int parno) 1994 { 1995 Rex_t* e; 1996 Rex_t* f; 1997 int c; 1998 int i; 1999 int n; 2000 int x; 2001 int esc; 2002 int typ; 2003 int beg; 2004 unsigned char* p; 2005 2006 beg = env->pattern == env->cursor - env->token.len; 2007 if (!(c = env->token.lex) && (c = *env->cursor)) 2008 env->cursor++; 2009 env->token.len = 0; 2010 env->parnest++; 2011 typ = -1; 2012 switch (c) 2013 { 2014 case '-': 2015 case '+': 2016 case 'a': 2017 case 'g': 2018 case 'i': 2019 case 'l': 2020 case 'm': 2021 case 'p': 2022 case 'r': 2023 case 's': 2024 case 'x': 2025 case 'A': 2026 case 'B': 2027 case 'E': 2028 case 'F': 2029 case 'G': 2030 case 'I': 2031 case 'K': 2032 case 'L': 2033 case 'M': /* glob(3) */ 2034 case 'N': /* glob(3) */ 2035 case 'O': /* glob(3) */ 2036 case 'R': /* pcre */ 2037 case 'S': 2038 case 'U': /* pcre */ 2039 case 'X': /* pcre */ 2040 x = REX_GROUP; 2041 i = 1; 2042 env->token.push = 1; 2043 for (;;) 2044 { 2045 switch (c) 2046 { 2047 case ')': 2048 if (!(env->flags & REG_LITERAL)) 2049 { 2050 env->error = REG_BADRPT; 2051 return 0; 2052 } 2053 /*FALLTHROUGH*/ 2054 case 0: 2055 case T_CLOSE: 2056 x = 0; 2057 goto done; 2058 case ':': 2059 eat(env); 2060 if (token(env) == T_CLOSE) 2061 x = 0; 2062 goto done; 2063 case '-': 2064 i = 0; 2065 break; 2066 case '+': 2067 i = 1; 2068 break; 2069 case 'a': 2070 if (i) 2071 env->flags |= (REG_LEFT|REG_RIGHT); 2072 else 2073 env->flags &= ~(REG_LEFT|REG_RIGHT); 2074 break; 2075 case 'g': 2076 if (i) 2077 env->flags &= ~REG_MINIMAL; 2078 else 2079 env->flags |= REG_MINIMAL; 2080 break; 2081 case 'i': 2082 if (i) 2083 env->flags |= REG_ICASE; 2084 else 2085 env->flags &= ~REG_ICASE; 2086 break; 2087 case 'l': 2088 if (i) 2089 env->flags |= REG_LEFT; 2090 else 2091 env->flags &= ~REG_LEFT; 2092 break; 2093 case 'm': 2094 if (i) 2095 env->flags |= REG_NEWLINE; 2096 else 2097 env->flags &= ~REG_NEWLINE; 2098 env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; 2099 break; 2100 case 'p': 2101 if (i) 2102 env->flags &= ~REG_LENIENT; 2103 else 2104 env->flags |= REG_LENIENT; 2105 break; 2106 case 'r': 2107 if (i) 2108 env->flags |= REG_RIGHT; 2109 else 2110 env->flags &= ~REG_RIGHT; 2111 break; 2112 case 's': 2113 if (i) 2114 env->flags |= REG_SPAN; 2115 else 2116 env->flags &= ~REG_SPAN; 2117 env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; 2118 break; 2119 case 'x': 2120 if (i) 2121 env->flags |= REG_COMMENT; 2122 else 2123 env->flags &= ~REG_COMMENT; 2124 break; 2125 case 'A': 2126 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 2127 env->flags |= REG_AUGMENTED|REG_EXTENDED; 2128 typ = ARE; 2129 break; 2130 case 'B': 2131 case 'G': 2132 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 2133 typ = BRE; 2134 break; 2135 case 'E': 2136 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 2137 env->flags |= REG_EXTENDED; 2138 typ = ERE; 2139 break; 2140 case 'F': 2141 case 'L': 2142 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 2143 env->flags |= REG_LITERAL; 2144 typ = ERE; 2145 break; 2146 case 'K': 2147 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 2148 env->flags |= REG_AUGMENTED|REG_SHELL|REG_LEFT|REG_RIGHT; 2149 typ = KRE; 2150 break; 2151 case 'M': 2152 /* used by caller to disable glob(3) GLOB_BRACE */ 2153 break; 2154 case 'N': 2155 /* used by caller to disable glob(3) GLOB_NOCHECK */ 2156 break; 2157 case 'O': 2158 /* used by caller to disable glob(3) GLOB_STARSTAR */ 2159 break; 2160 case 'S': 2161 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 2162 env->flags |= REG_SHELL|REG_LEFT|REG_RIGHT; 2163 typ = SRE; 2164 break; 2165 case 'U': /* PCRE_UNGREEDY */ 2166 if (i) 2167 env->flags |= REG_MINIMAL; 2168 else 2169 env->flags &= ~REG_MINIMAL; 2170 break; 2171 case 'X': /* PCRE_EXTRA */ 2172 break; 2173 default: 2174 env->error = REG_BADRPT; 2175 return 0; 2176 } 2177 eat(env); 2178 c = token(env); 2179 } 2180 done: 2181 break; 2182 case ':': 2183 x = REX_GROUP; 2184 break; 2185 case '=': 2186 x = REX_GROUP_AHEAD; 2187 break; 2188 case '!': 2189 x = REX_GROUP_AHEAD_NOT; 2190 break; 2191 case '<': 2192 switch (token(env)) 2193 { 2194 case '=': 2195 x = REX_GROUP_BEHIND; 2196 break; 2197 case '!': 2198 case T_BANG: 2199 x = REX_GROUP_BEHIND_NOT; 2200 break; 2201 default: 2202 env->error = REG_BADRPT; 2203 return 0; 2204 } 2205 eat(env); 2206 break; 2207 case '>': 2208 x = REX_GROUP_CUT; 2209 break; 2210 case '%': 2211 case T_PERCENT: 2212 e = node(env, REX_NEST, 0, 0, (UCHAR_MAX + 1) * sizeof(unsigned short)); 2213 e->re.nest.primary = isalnum(*env->cursor) ? -1 : *env->cursor; 2214 n = 1; 2215 for (;;) 2216 { 2217 switch (i = chr(env, &esc)) 2218 { 2219 case -1: 2220 case 0: 2221 invalid: 2222 env->cursor -= esc + 1; 2223 env->error = REG_EPAREN; 2224 return 0; 2225 case 'D': 2226 x = REX_NEST_delimiter; 2227 /*FALLTHROUGH*/ 2228 delimiter: 2229 if ((i = chr(env, &esc)) < 0) 2230 goto invalid; 2231 if (e->re.nest.type[i] & ~x) 2232 goto invalid; 2233 e->re.nest.type[i] = x; 2234 continue; 2235 case 'E': 2236 x = REX_NEST_escape; 2237 goto delimiter; 2238 case 'L': 2239 x = REX_NEST_literal; 2240 goto quote; 2241 case 'O': 2242 switch (i = chr(env, &esc)) 2243 { 2244 case 'T': 2245 e->re.nest.type[UCHAR_MAX+1] |= REX_NEST_terminator; 2246 break; 2247 default: 2248 goto invalid; 2249 } 2250 continue; 2251 case 'Q': 2252 x = REX_NEST_quote; 2253 /*FALLTHROUGH*/ 2254 quote: 2255 if ((i = chr(env, &esc)) < 0) 2256 goto invalid; 2257 if (e->re.nest.type[i] & ~x) 2258 goto invalid; 2259 e->re.nest.type[i] = x|REX_NEST_open|REX_NEST_close|(i<<REX_NEST_SHIFT); 2260 continue; 2261 case 'S': 2262 x = REX_NEST_separator; 2263 goto delimiter; 2264 case 'T': 2265 x = REX_NEST_terminator; 2266 goto delimiter; 2267 case '|': 2268 case '&': 2269 if (!esc) 2270 goto invalid; 2271 goto nesting; 2272 case '(': 2273 if (!esc) 2274 n++; 2275 goto nesting; 2276 case ')': 2277 if (!esc && !--n) 2278 break; 2279 goto nesting; 2280 default: 2281 nesting: 2282 if (isalnum(i) || (e->re.nest.type[i] & (REX_NEST_close|REX_NEST_escape|REX_NEST_literal|REX_NEST_quote|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) 2283 goto invalid; 2284 e->re.nest.type[i] = REX_NEST_open; 2285 if ((x = chr(env, &esc)) < 0 || (e->re.nest.type[x] & (REX_NEST_close|REX_NEST_escape|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) 2286 goto invalid; 2287 if (!esc) 2288 { 2289 if (x == ')' && !--n) 2290 goto invalid; 2291 else if (x == '(') 2292 n++; 2293 } 2294 e->re.nest.type[x] |= REX_NEST_close; 2295 e->re.nest.type[i] |= x << REX_NEST_SHIFT; 2296 continue; 2297 } 2298 break; 2299 } 2300 env->parnest--; 2301 if (c == T_PERCENT) 2302 for (n = 0; n < 2; n++) 2303 { 2304 parno = ++env->parno; 2305 if (!(f = node(env, REX_GROUP, 0, 0, 0))) 2306 { 2307 drop(env->disc, e); 2308 return 0; 2309 } 2310 if (parno < elementsof(env->paren)) 2311 env->paren[parno] = f; 2312 f->re.group.back = 0; 2313 f->re.group.number = parno; 2314 f->re.group.expr.rex = e; 2315 e = f; 2316 } 2317 return e; 2318 case '(': 2319 c = 0; 2320 if (isdigit(*env->cursor)) 2321 { 2322 f = 0; 2323 do 2324 { 2325 if (c > (INT_MAX / 10)) 2326 { 2327 env->error = REG_BADRPT; 2328 return 0; 2329 } 2330 c = c * 10 + (*env->cursor++ - '0'); 2331 } while (isdigit(*env->cursor)); 2332 if (*env->cursor++ != ')') 2333 { 2334 env->error = REG_BADRPT; 2335 return 0; 2336 } 2337 if (c && env->type >= SRE) 2338 c = c * 2 - 1; 2339 if (!c || c > env->parno || !env->paren[c]) 2340 { 2341 if (!(env->flags & REG_LENIENT)) 2342 { 2343 env->error = REG_ESUBREG; 2344 return 0; 2345 } 2346 if (c) 2347 c = -1; 2348 } 2349 } 2350 else 2351 { 2352 if (env->type < SRE && *env->cursor++ != '?') 2353 { 2354 env->error = REG_BADRPT; 2355 return 0; 2356 } 2357 if (!(f = grp(env, parno + 1)) && env->error) 2358 return 0; 2359 } 2360 if (!(e = node(env, REX_GROUP_COND, 0, 0, 0))) 2361 { 2362 drop(env->disc, f); 2363 return 0; 2364 } 2365 e->re.group.size = c; 2366 e->re.group.expr.binary.left = f; 2367 if (!(e->re.group.expr.binary.right = alt(env, parno, 1))) 2368 { 2369 drop(env->disc, e); 2370 return 0; 2371 } 2372 if (token(env) != T_CLOSE) 2373 { 2374 env->error = REG_EPAREN; 2375 return 0; 2376 } 2377 eat(env); 2378 env->parnest--; 2379 return rep(env, e, parno, parno); 2380 case '{': 2381 p = env->cursor; 2382 n = 1; 2383 while (c = *env->cursor) 2384 { 2385 if (c == '\\' && *(env->cursor + 1)) 2386 env->cursor++; 2387 else if (c == '{') 2388 n++; 2389 else if (c == '}' && !--n) 2390 break; 2391 else if (c == env->delimiter || c == env->terminator) 2392 break; 2393 env->cursor++; 2394 } 2395 if (c != '}') 2396 { 2397 env->error = REG_EBRACE; 2398 return 0; 2399 } 2400 if (*++env->cursor != ')') 2401 { 2402 env->error = REG_EPAREN; 2403 return 0; 2404 } 2405 env->cursor++; 2406 env->parnest--; 2407 if (env->disc->re_version < REG_VERSION_EXEC) 2408 { 2409 env->error = REG_BADRPT; 2410 return 0; 2411 } 2412 if (!env->disc->re_execf) 2413 return 0; 2414 if (!(e = node(env, REX_EXEC, 0, 0, 0))) 2415 return 0; 2416 e->re.exec.text = (const char*)p; 2417 e->re.exec.size = env->cursor - p - 2; 2418 if (!env->disc->re_compf) 2419 e->re.exec.data = 0; 2420 else 2421 e->re.exec.data = (*env->disc->re_compf)(env->regex, e->re.exec.text, e->re.exec.size, env->disc); 2422 return e; 2423 case '0': case '1': case '2': case '3': case '4': 2424 case '5': case '6': case '7': case '8': case '9': 2425 c -= '0'; 2426 while (isdigit(*env->cursor)) 2427 { 2428 if (c > (INT_MAX / 10)) 2429 { 2430 env->error = REG_ESUBREG; 2431 return 0; 2432 } 2433 c = c * 10 + *env->cursor++ - '0'; 2434 } 2435 if (*env->cursor == ')') 2436 { 2437 env->cursor++; 2438 env->parnest--; 2439 env->token.len = 1; 2440 if (c > env->parno || !env->paren[c]) 2441 { 2442 env->error = REG_ESUBREG; 2443 return 0; 2444 } 2445 env->paren[c]->re.group.back = 1; 2446 return rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); 2447 } 2448 /*FALLTHROUGH*/ 2449 default: 2450 env->error = REG_BADRPT; 2451 return 0; 2452 } 2453 if (x && !(e = alt(env, parno, 0))) 2454 return 0; 2455 c = token(env); 2456 env->parnest--; 2457 if (c != T_CLOSE && (!(env->flags & REG_LITERAL) || c != ')')) 2458 { 2459 env->error = REG_EPAREN; 2460 return 0; 2461 } 2462 eat(env); 2463 if (typ >= 0) 2464 { 2465 if (beg) 2466 env->pattern = env->cursor; 2467 env->type = typ; 2468 } 2469 if (!x) 2470 return 0; 2471 if (!(f = node(env, x, 0, 0, 0))) 2472 { 2473 drop(env->disc, e); 2474 return 0; 2475 } 2476 f->re.group.expr.rex = e; 2477 if (x == REX_GROUP_BEHIND || x == REX_GROUP_BEHIND_NOT) 2478 { 2479 if (stats(env, e)) 2480 { 2481 drop(env->disc, f); 2482 if (!env->error) 2483 env->error = REG_ECOUNT; 2484 return 0; 2485 } 2486 f->re.group.size = env->stats.m; 2487 memset(&env->stats, 0, sizeof(env->stats)); 2488 } 2489 switch (x) 2490 { 2491 case REX_GROUP: 2492 case REX_GROUP_CUT: 2493 f = rep(env, f, parno, env->parno); 2494 break; 2495 } 2496 return f; 2497 } 2498 2499 static Rex_t* 2500 seq(Cenv_t* env) 2501 { 2502 Rex_t* e; 2503 Rex_t* f; 2504 Token_t tok; 2505 int c; 2506 int i; 2507 int n; 2508 int x; 2509 int parno; 2510 int type; 2511 regflags_t flags; 2512 unsigned char* s; 2513 unsigned char* p; 2514 unsigned char* t; 2515 unsigned char* u; 2516 unsigned char buf[256]; 2517 2518 for (;;) 2519 { 2520 s = buf; 2521 while ((c = token(env)) < T_META && s < &buf[sizeof(buf) - env->token.len]) 2522 { 2523 x = c; 2524 p = env->cursor; 2525 if (c >= 0) 2526 { 2527 n = 1; 2528 *s++ = (env->flags & REG_ICASE) ? toupper(c) : c; 2529 } 2530 else if (c == C_ESC || (env->flags & REG_ICASE)) 2531 { 2532 c = (c == C_ESC) ? env->token.lex : mbchar(p); 2533 if (env->flags & REG_ICASE) 2534 c = towupper(c); 2535 if ((&buf[sizeof(buf)] - s) < MB_CUR_MAX) 2536 break; 2537 if ((n = wctomb((char*)s, c)) < 0) 2538 *s++ = c; 2539 else if (n) 2540 s += n; 2541 else 2542 { 2543 n = 1; 2544 *s++ = 0; 2545 } 2546 } 2547 else 2548 { 2549 n = env->token.len - env->token.esc; 2550 for (t = p, u = s + n; s < u; *s++ = *t++); 2551 } 2552 eat(env); 2553 } 2554 if (c == T_BAD) 2555 return 0; 2556 if (s > buf) 2557 switch (c) 2558 { 2559 case T_STAR: 2560 case T_PLUS: 2561 case T_LEFT: 2562 case T_QUES: 2563 case T_BANG: 2564 if ((s -= n) == buf) 2565 e = 0; 2566 else 2567 { 2568 i = s - buf; 2569 if (!(e = node(env, REX_STRING, 0, 0, i))) 2570 return 0; 2571 memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, i); 2572 e->re.string.size = i; 2573 } 2574 if (x >= 0) 2575 { 2576 if (!(f = node(env, REX_ONECHAR, 1, 1, 0))) 2577 { 2578 drop(env->disc, e); 2579 return 0; 2580 } 2581 f->re.onechar = (env->flags & REG_ICASE) ? toupper(x) : x; 2582 } 2583 else 2584 { 2585 if (!(f = node(env, REX_STRING, 0, 0, n))) 2586 return 0; 2587 memcpy((char*)(f->re.string.base = (unsigned char*)f->re.data), (char*)p, n); 2588 f->re.string.size = n; 2589 } 2590 if (!(f = rep(env, f, 0, 0)) || !(f = cat(env, f, seq(env)))) 2591 { 2592 drop(env->disc, e); 2593 return 0; 2594 } 2595 if (e) 2596 f = cat(env, e, f); 2597 return f; 2598 default: 2599 c = s - buf; 2600 if (!(e = node(env, REX_STRING, 0, 0, c))) 2601 return 0; 2602 memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, c); 2603 e->re.string.size = c; 2604 return cat(env, e, seq(env)); 2605 } 2606 else if (c > T_BACK) 2607 { 2608 eat(env); 2609 c -= T_BACK; 2610 if (c > env->parno || !env->paren[c]) 2611 { 2612 env->error = REG_ESUBREG; 2613 return 0; 2614 } 2615 env->paren[c]->re.group.back = 1; 2616 e = rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); 2617 } 2618 else 2619 switch (c) 2620 { 2621 case T_AND: 2622 case T_CLOSE: 2623 case T_BAR: 2624 case T_END: 2625 return node(env, REX_NULL, 0, 0, 0); 2626 case T_DOLL: 2627 eat(env); 2628 e = rep(env, node(env, REX_END, 0, 0, 0), 0, 0); 2629 break; 2630 case T_CFLX: 2631 eat(env); 2632 if ((e = node(env, REX_BEG, 0, 0, 0)) && (env->flags & REG_EXTENDED)) 2633 e = rep(env, e, 0, 0); 2634 break; 2635 case T_OPEN: 2636 tok = env->token; 2637 eat(env); 2638 flags = env->flags; 2639 type = env->type; 2640 if (env->token.att) 2641 env->flags |= REG_MINIMAL; 2642 env->parnest++; 2643 if (env->type == KRE) 2644 ++env->parno; 2645 parno = ++env->parno; 2646 if (!(e = alt(env, parno + 1, 0))) 2647 break; 2648 if (e->type == REX_NULL && env->type == ERE && !(env->flags & REG_NULL)) 2649 { 2650 drop(env->disc, e); 2651 env->error = (*env->cursor == 0 || *env->cursor == env->delimiter || *env->cursor == env->terminator) ? REG_EPAREN : REG_ENULL; 2652 return 0; 2653 } 2654 if (token(env) != T_CLOSE) 2655 { 2656 drop(env->disc, e); 2657 env->error = REG_EPAREN; 2658 return 0; 2659 } 2660 env->parnest--; 2661 eat(env); 2662 if (!(f = node(env, REX_GROUP, 0, 0, 0))) 2663 { 2664 drop(env->disc, e); 2665 return 0; 2666 } 2667 if (parno < elementsof(env->paren)) 2668 env->paren[parno] = f; 2669 f->re.group.back = 0; 2670 f->re.group.number = parno; 2671 f->re.group.expr.rex = e; 2672 if (tok.lex) 2673 { 2674 tok.push = 1; 2675 env->token = tok; 2676 } 2677 if (!(e = rep(env, f, parno, env->parno))) 2678 return 0; 2679 if (env->type == KRE) 2680 { 2681 if (!(f = node(env, REX_GROUP, 0, 0, 0))) 2682 { 2683 drop(env->disc, e); 2684 return 0; 2685 } 2686 if (--parno < elementsof(env->paren)) 2687 env->paren[parno] = f; 2688 f->re.group.back = 0; 2689 f->re.group.number = parno; 2690 f->re.group.expr.rex = e; 2691 e = f; 2692 } 2693 env->flags = flags; 2694 env->type = type; 2695 break; 2696 case T_GROUP: 2697 p = env->cursor; 2698 eat(env); 2699 flags = env->flags; 2700 type = env->type; 2701 if (!(e = grp(env, env->parno + 1))) 2702 { 2703 if (env->error) 2704 return 0; 2705 if (env->literal == env->pattern && env->literal == p) 2706 env->literal = env->cursor; 2707 continue; 2708 } 2709 env->flags = flags; 2710 env->type = type; 2711 break; 2712 case T_BRA: 2713 eat(env); 2714 if (e = bra(env)) 2715 e = rep(env, e, 0, 0); 2716 break; 2717 case T_ALNUM: 2718 case T_ALNUM_NOT: 2719 case T_DIGIT: 2720 case T_DIGIT_NOT: 2721 case T_SPACE: 2722 case T_SPACE_NOT: 2723 eat(env); 2724 if (e = ccl(env, c)) 2725 e = rep(env, e, 0, 0); 2726 break; 2727 case T_LT: 2728 eat(env); 2729 e = rep(env, node(env, REX_WBEG, 0, 0, 0), 0, 0); 2730 break; 2731 case T_GT: 2732 eat(env); 2733 e = rep(env, node(env, REX_WEND, 0, 0, 0), 0, 0); 2734 break; 2735 case T_DOT: 2736 eat(env); 2737 e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); 2738 break; 2739 case T_DOTSTAR: 2740 eat(env); 2741 env->token.lex = T_STAR; 2742 env->token.push = 1; 2743 e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); 2744 break; 2745 case T_SLASHPLUS: 2746 eat(env); 2747 env->token.lex = T_PLUS; 2748 env->token.push = 1; 2749 if (e = node(env, REX_ONECHAR, 1, 1, 0)) 2750 { 2751 e->re.onechar = '/'; 2752 e = rep(env, e, 0, 0); 2753 } 2754 break; 2755 case T_WORD: 2756 eat(env); 2757 e = rep(env, node(env, REX_WORD, 0, 0, 0), 0, 0); 2758 break; 2759 case T_WORD_NOT: 2760 eat(env); 2761 e = rep(env, node(env, REX_WORD_NOT, 0, 0, 0), 0, 0); 2762 break; 2763 case T_BEG_STR: 2764 eat(env); 2765 e = rep(env, node(env, REX_BEG_STR, 0, 0, 0), 0, 0); 2766 break; 2767 case T_END_STR: 2768 eat(env); 2769 e = rep(env, node(env, REX_END_STR, 0, 0, 0), 0, 0); 2770 break; 2771 case T_FIN_STR: 2772 eat(env); 2773 e = rep(env, node(env, REX_FIN_STR, 0, 0, 0), 0, 0); 2774 break; 2775 default: 2776 env->error = REG_BADRPT; 2777 return 0; 2778 } 2779 if (e && *env->cursor != 0 && *env->cursor != env->delimiter && *env->cursor != env->terminator) 2780 e = cat(env, e, seq(env)); 2781 return e; 2782 } 2783 } 2784 2785 static Rex_t* 2786 con(Cenv_t* env) 2787 { 2788 Rex_t* e; 2789 Rex_t* f; 2790 Rex_t* g; 2791 2792 if (!(e = seq(env)) || !(env->flags & REG_AUGMENTED) || token(env) != T_AND) 2793 return e; 2794 eat(env); 2795 if (!(f = con(env))) 2796 { 2797 drop(env->disc, e); 2798 return 0; 2799 } 2800 if (!(g = node(env, REX_CONJ, 0, 0, 0))) 2801 { 2802 drop(env->disc, e); 2803 drop(env->disc, f); 2804 return 0; 2805 } 2806 g->re.group.expr.binary.left = e; 2807 g->re.group.expr.binary.right = f; 2808 return g; 2809 } 2810 2811 static Rex_t* 2812 alt(Cenv_t* env, int number, int cond) 2813 { 2814 Rex_t* e; 2815 Rex_t* f; 2816 Rex_t* g; 2817 2818 if (!(e = con(env))) 2819 return 0; 2820 else if (token(env) != T_BAR) 2821 { 2822 if (!cond) 2823 return e; 2824 f = 0; 2825 if (e->type == REX_NULL) 2826 goto bad; 2827 } 2828 else 2829 { 2830 eat(env); 2831 if (!(f = alt(env, number, 0))) 2832 { 2833 drop(env->disc, e); 2834 return 0; 2835 } 2836 if ((e->type == REX_NULL || f->type == REX_NULL) && !(env->flags & REG_NULL)) 2837 goto bad; 2838 if (!cond && (g = trie(env, e, f))) 2839 return g; 2840 } 2841 if (!(g = node(env, REX_ALT, 0, 0, 0))) 2842 { 2843 env->error = REG_ESPACE; 2844 goto bad; 2845 } 2846 g->re.group.number = number; 2847 g->re.group.last = env->parno; 2848 g->re.group.expr.binary.left = e; 2849 g->re.group.expr.binary.right = f; 2850 return g; 2851 bad: 2852 drop(env->disc, e); 2853 drop(env->disc, f); 2854 if (!env->error) 2855 env->error = REG_ENULL; 2856 return 0; 2857 } 2858 2859 /* 2860 * add v to REX_BM tables 2861 */ 2862 2863 static void 2864 bmstr(Cenv_t* env, register Rex_t* a, unsigned char* v, int n, Bm_mask_t b) 2865 { 2866 int c; 2867 int m; 2868 size_t z; 2869 2870 for (m = 0; m < n; m++) 2871 { 2872 if (!(z = n - m - 1)) 2873 z = HIT; 2874 c = v[m]; 2875 a->re.bm.mask[m][c] |= b; 2876 if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) 2877 a->re.bm.skip[c] = z; 2878 if (a->flags & REG_ICASE) 2879 { 2880 if (isupper(c)) 2881 c = tolower(c); 2882 else if (islower(c)) 2883 c = toupper(c); 2884 else 2885 continue; 2886 a->re.bm.mask[m][c] |= b; 2887 if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) 2888 a->re.bm.skip[c] = z; 2889 } 2890 } 2891 } 2892 2893 /* 2894 * set up BM table from trie 2895 */ 2896 2897 static int 2898 bmtrie(Cenv_t* env, Rex_t* a, unsigned char* v, Trie_node_t* x, int n, int m, Bm_mask_t b) 2899 { 2900 do 2901 { 2902 v[m] = x->c; 2903 if (m >= (n - 1)) 2904 { 2905 bmstr(env, a, v, n, b); 2906 if (!(b <<= 1)) 2907 { 2908 b = 1; 2909 a->re.bm.complete = 0; 2910 } 2911 else if (x->son) 2912 a->re.bm.complete = 0; 2913 } 2914 else if (x->son) 2915 b = bmtrie(env, a, v, x->son, n, m + 1, b); 2916 } while (x = x->sib); 2917 return b; 2918 } 2919 2920 /* 2921 * rewrite the expression tree for some special cases 2922 * 1. it is a null expression - illegal 2923 * 2. max length fixed string found -- use BM algorithm 2924 * 3. it begins with an unanchored string - use KMP algorithm 2925 * 0 returned on success 2926 */ 2927 2928 static int 2929 special(Cenv_t* env, regex_t* p) 2930 { 2931 Rex_t* a; 2932 Rex_t* e; 2933 Rex_t* t; 2934 Rex_t* x; 2935 Rex_t* y; 2936 unsigned char* s; 2937 int* f; 2938 int n; 2939 int m; 2940 int k; 2941 2942 DEBUG_INIT(); 2943 if (e = p->env->rex) 2944 { 2945 if ((x = env->stats.x) && x->re.string.size < 3) 2946 x = 0; 2947 if ((t = env->stats.y) && t->re.trie.min < 3) 2948 t = 0; 2949 if (x && t) 2950 { 2951 if (x->re.string.size >= t->re.trie.min) 2952 t = 0; 2953 else 2954 x = 0; 2955 } 2956 if (x || t) 2957 { 2958 Bm_mask_t** mask; 2959 Bm_mask_t* h; 2960 unsigned char* v; 2961 size_t* q; 2962 unsigned long l; 2963 int i; 2964 int j; 2965 2966 if (x) 2967 { 2968 y = x; 2969 n = m = x->re.string.size; 2970 l = env->stats.l; 2971 } 2972 else 2973 { 2974 y = t; 2975 n = t->re.trie.min; 2976 m = t->re.trie.max; 2977 l = env->stats.k; 2978 } 2979 if (!(q = (size_t*)alloc(env->disc, 0, (n + 1) * sizeof(size_t)))) 2980 return 1; 2981 if (!(a = node(env, REX_BM, 0, 0, n * (sizeof(Bm_mask_t*) + (UCHAR_MAX + 1) * sizeof(Bm_mask_t)) + (UCHAR_MAX + n + 2) * sizeof(size_t)))) 2982 { 2983 alloc(env->disc, q, 0); 2984 return 1; 2985 } 2986 a->flags = y->flags; 2987 a->map = y->map; 2988 a->re.bm.size = n; 2989 a->re.bm.back = (y == e || y == e->re.group.expr.rex) ? (m - n) : -1; 2990 a->re.bm.left = l - 1; 2991 a->re.bm.right = env->stats.m - l - n; 2992 a->re.bm.complete = (y != e && (e->type != REX_GROUP || y != e->re.group.expr.rex) || e->next || ((a->re.bm.left + a->re.bm.right) >= 0)) ? 0 : n; 2993 h = (Bm_mask_t*)&a->re.bm.mask[n]; 2994 a->re.bm.skip = (size_t*)(h + n * (UCHAR_MAX + 1)); 2995 a->re.bm.fail = &a->re.bm.skip[UCHAR_MAX + 1]; 2996 for (m = 0; m <= UCHAR_MAX; m++) 2997 a->re.bm.skip[m] = n; 2998 a->re.bm.skip[0] = a->re.bm.skip[env->mappednewline] = (y->next && y->next->type == REX_END) ? HIT : (n + a->re.bm.left); 2999 for (i = 1; i <= n; i++) 3000 a->re.bm.fail[i] = 2 * n - i; 3001 mask = a->re.bm.mask; 3002 for (m = 0; m < n; m++) 3003 { 3004 mask[m] = h; 3005 h += UCHAR_MAX + 1; 3006 } 3007 if (x) 3008 bmstr(env, a, x->re.string.base, n, 1); 3009 else 3010 { 3011 v = (unsigned char*)q; 3012 memset(v, 0, n); 3013 m = 1; 3014 for (i = 0; i <= UCHAR_MAX; i++) 3015 if (t->re.trie.root[i]) 3016 m = bmtrie(env, a, v, t->re.trie.root[i], n, 0, m); 3017 } 3018 mask--; 3019 memset(q, 0, n * sizeof(*q)); 3020 for (k = (j = n) + 1; j > 0; j--, k--) 3021 { 3022 DEBUG_TEST(0x0010,(sfprintf(sfstderr, "BM#0: k=%d j=%d\n", k, j)),(0)); 3023 for (q[j] = k; k <= n; k = q[k]) 3024 { 3025 for (m = 0; m <= UCHAR_MAX; m++) 3026 if (mask[k][m] == mask[j][m]) 3027 { 3028 DEBUG_TEST(0x0010,sfprintf(sfstderr, "CUT1: mask[%d][%c]=mask[%d][%c]\n", k, m, j, m), (0)); 3029 goto cut; 3030 } 3031 DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#2: fail[%d]=%d => %d\n", k, a->re.bm.fail[k], (a->re.bm.fail[k] > n - j) ? (n - j) : a->re.bm.fail[k]), (0)); 3032 if (a->re.bm.fail[k] > n - j) 3033 a->re.bm.fail[k] = n - j; 3034 } 3035 cut: ; 3036 } 3037 for (i = 1; i <= n; i++) 3038 if (a->re.bm.fail[i] > n + k - i) 3039 { 3040 DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#4: fail[%d]=%d => %d\n", i, a->re.bm.fail[i], n + k - i), (0)); 3041 a->re.bm.fail[i] = n + k - i; 3042 } 3043 #if _AST_REGEX_DEBUG 3044 if (DEBUG_TEST(0x0020,(1),(0))) 3045 { 3046 sfprintf(sfstderr, "STAT: complete=%d n=%d k=%d l=%d r=%d y=%d:%d e=%d:%d\n", a->re.bm.complete, n, k, a->re.bm.left, a->re.bm.right, y->type, y->next ? y->next->type : 0, e->type, e->next ? e->next->type : 0); 3047 for (m = 0; m < n; m++) 3048 for (i = 1; i <= UCHAR_MAX; i++) 3049 if (a->re.bm.mask[m][i]) 3050 sfprintf(sfstderr, "MASK: [%d]['%c'] = %032..2u\n", m, i, a->re.bm.mask[m][i]); 3051 for (i = ' '; i <= UCHAR_MAX; i++) 3052 if (a->re.bm.skip[i] >= HIT) 3053 sfprintf(sfstderr, "SKIP: ['%c'] = *\n", i); 3054 else if (a->re.bm.skip[i] > 0 && a->re.bm.skip[i] < n) 3055 sfprintf(sfstderr, "SKIP: ['%c'] = %3d\n", i, a->re.bm.skip[i]); 3056 for (j = 31; j >= 0; j--) 3057 { 3058 sfprintf(sfstderr, " "); 3059 next: 3060 for (m = 0; m < n; m++) 3061 { 3062 for (i = 0040; i < 0177; i++) 3063 if (a->re.bm.mask[m][i] & (1 << j)) 3064 { 3065 sfprintf(sfstderr, " %c", i); 3066 break; 3067 } 3068 if (i >= 0177) 3069 { 3070 if (j > 0) 3071 { 3072 j--; 3073 goto next; 3074 } 3075 sfprintf(sfstderr, " ?"); 3076 } 3077 } 3078 sfprintf(sfstderr, "\n"); 3079 } 3080 sfprintf(sfstderr, "FAIL: "); 3081 for (m = 1; m <= n; m++) 3082 sfprintf(sfstderr, "%3d", a->re.bm.fail[m]); 3083 sfprintf(sfstderr, "\n"); 3084 } 3085 #endif 3086 alloc(env->disc, q, 0); 3087 a->next = e; 3088 p->env->rex = a; 3089 return 0; 3090 } 3091 switch (e->type) 3092 { 3093 case REX_BEG: 3094 if (env->flags & REG_NEWLINE) 3095 return 0; 3096 break; 3097 case REX_GROUP: 3098 if (env->stats.b) 3099 return 0; 3100 e = e->re.group.expr.rex; 3101 if (e->type != REX_DOT) 3102 return 0; 3103 /*FALLTHROUGH*/ 3104 case REX_DOT: 3105 if (e->lo == 0 && e->hi == RE_DUP_INF) 3106 break; 3107 return 0; 3108 case REX_NULL: 3109 if (env->flags & REG_NULL) 3110 break; 3111 env->error = REG_ENULL; 3112 return 1; 3113 case REX_STRING: 3114 if ((env->flags & (REG_LEFT|REG_LITERAL|REG_RIGHT)) || e->map) 3115 return 0; 3116 s = e->re.string.base; 3117 n = e->re.string.size; 3118 if (!(a = node(env, REX_KMP, 0, 0, n * (sizeof(int*) + 1)))) 3119 return 1; 3120 a->flags = e->flags; 3121 a->map = e->map; 3122 f = a->re.string.fail; 3123 memcpy((char*)(a->re.string.base = (unsigned char*)&f[n]), (char*)s, n); 3124 s = a->re.string.base; 3125 a->re.string.size = n; 3126 f[0] = m = -1; 3127 for (k = 1; k < n; k++) 3128 { 3129 while (m >= 0 && s[m+1] != s[k]) 3130 m = f[m]; 3131 if (s[m+1] == s[k]) 3132 m++; 3133 f[k] = m; 3134 } 3135 a->next = e->next; 3136 p->env->rex = a; 3137 e->next = 0; 3138 drop(env->disc, e); 3139 break; 3140 default: 3141 return 0; 3142 } 3143 } 3144 p->env->once = 1; 3145 return 0; 3146 } 3147 3148 int 3149 regcomp(regex_t* p, const char* pattern, regflags_t flags) 3150 { 3151 Rex_t* e; 3152 Rex_t* f; 3153 regdisc_t* disc; 3154 unsigned char* fold; 3155 int i; 3156 Cenv_t env; 3157 3158 if (!p) 3159 return REG_BADPAT; 3160 if (flags & REG_DISCIPLINE) 3161 { 3162 flags &= ~REG_DISCIPLINE; 3163 disc = p->re_disc; 3164 } 3165 else 3166 disc = &state.disc; 3167 if (!disc->re_errorlevel) 3168 disc->re_errorlevel = 2; 3169 p->env = 0; 3170 if (!pattern) 3171 return fatal(disc, REG_BADPAT, pattern); 3172 if (!state.initialized) 3173 { 3174 state.initialized = 1; 3175 for (i = 0; i < elementsof(state.escape); i++) 3176 state.magic[state.escape[i].key] = state.escape[i].val; 3177 } 3178 if (!(fold = (unsigned char*)LCINFO(AST_LC_CTYPE)->data)) 3179 { 3180 if (!(fold = newof(0, unsigned char, UCHAR_MAX, 1))) 3181 return fatal(disc, REG_ESPACE, pattern); 3182 for (i = 0; i <= UCHAR_MAX; i++) 3183 fold[i] = toupper(i); 3184 LCINFO(AST_LC_CTYPE)->data = (void*)fold; 3185 } 3186 again: 3187 if (!(p->env = (Env_t*)alloc(disc, 0, sizeof(Env_t)))) 3188 return fatal(disc, REG_ESPACE, pattern); 3189 memset(p->env, 0, sizeof(*p->env)); 3190 memset(&env, 0, sizeof(env)); 3191 env.regex = p; 3192 env.flags = flags; 3193 env.disc = p->env->disc = disc; 3194 if (env.flags & REG_AUGMENTED) 3195 env.flags |= REG_EXTENDED; 3196 env.mappeddot = '.'; 3197 env.mappednewline = '\n'; 3198 env.mappedslash = '/'; 3199 if (disc->re_version >= REG_VERSION_MAP && disc->re_map) 3200 { 3201 env.map = disc->re_map; 3202 env.MAP = p->env->fold; 3203 for (i = 0; i <= UCHAR_MAX; i++) 3204 { 3205 env.MAP[i] = fold[env.map[i]]; 3206 if (env.map[i] == '.') 3207 env.mappeddot = i; 3208 if (env.map[i] == '\n') 3209 env.mappednewline = i; 3210 if (env.map[i] == '/') 3211 env.mappedslash = i; 3212 } 3213 } 3214 else 3215 env.MAP = fold; 3216 env.type = (env.flags & REG_AUGMENTED) ? ARE : (env.flags & REG_EXTENDED) ? ERE : BRE; 3217 env.explicit = -1; 3218 if (env.flags & REG_SHELL) 3219 { 3220 if (env.flags & REG_SHELL_PATH) 3221 env.explicit = env.mappedslash; 3222 env.flags |= REG_LENIENT|REG_NULL; 3223 env.type = env.type == BRE ? SRE : KRE; 3224 } 3225 if ((env.flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE) 3226 env.explicit = env.mappednewline; 3227 p->env->leading = (env.flags & REG_SHELL_DOT) ? env.mappeddot : -1; 3228 env.posixkludge = !(env.flags & (REG_EXTENDED|REG_SHELL)); 3229 env.token.lex = 0; 3230 env.token.push = 0; 3231 if (env.flags & REG_DELIMITED) 3232 { 3233 switch (env.delimiter = *pattern++) 3234 { 3235 case 0: 3236 case '\\': 3237 case '\n': 3238 case '\r': 3239 env.error = REG_EDELIM; 3240 goto bad; 3241 } 3242 env.terminator = '\n'; 3243 } 3244 env.literal = env.pattern = env.cursor = (unsigned char*)pattern; 3245 if (!(p->env->rex = alt(&env, 1, 0))) 3246 goto bad; 3247 if (env.parnest) 3248 { 3249 env.error = REG_EPAREN; 3250 goto bad; 3251 } 3252 p->env->stats.re_flags = env.flags & (REG_EXTENDED|REG_AUGMENTED|REG_SHELL); 3253 if (env.flags & REG_LEFT) 3254 { 3255 if (p->env->rex->type != REX_BEG) 3256 { 3257 if (p->env->rex->type == REX_ALT) 3258 env.flags &= ~REG_FIRST; 3259 if (!(e = node(&env, REX_BEG, 0, 0, 0))) 3260 { 3261 regfree(p); 3262 return fatal(disc, REG_ESPACE, pattern); 3263 } 3264 e->next = p->env->rex; 3265 p->env->rex = e; 3266 p->env->once = 1; 3267 } 3268 p->env->stats.re_flags |= REG_LEFT; 3269 } 3270 for (e = p->env->rex; e->next; e = e->next); 3271 p->env->done.type = REX_DONE; 3272 p->env->done.flags = e->flags; 3273 if (env.flags & REG_RIGHT) 3274 { 3275 if (e->type != REX_END) 3276 { 3277 if (p->env->rex->type == REX_ALT) 3278 env.flags &= ~REG_FIRST; 3279 if (!(f = node(&env, REX_END, 0, 0, 0))) 3280 { 3281 regfree(p); 3282 return fatal(disc, REG_ESPACE, pattern); 3283 } 3284 f->flags = e->flags; 3285 f->map = e->map; 3286 e->next = f; 3287 } 3288 p->env->stats.re_flags |= REG_RIGHT; 3289 } 3290 if (stats(&env, p->env->rex)) 3291 { 3292 if (!env.error) 3293 env.error = REG_ECOUNT; 3294 goto bad; 3295 } 3296 if (env.stats.b) 3297 p->env->hard = p->env->separate = 1; 3298 else if (!(env.flags & REG_FIRST) && (env.stats.a || env.stats.c > 1 && env.stats.c != env.stats.s || env.stats.t && (env.stats.t > 1 || env.stats.a || env.stats.c))) 3299 p->env->hard = 1; 3300 if (p->env->hard || env.stats.c || env.stats.i) 3301 p->env->stats.re_min = p->env->stats.re_max = -1; 3302 else 3303 { 3304 if (!(p->env->stats.re_min = env.stats.m)) 3305 p->env->stats.re_min = -1; 3306 if (!(p->env->stats.re_max = env.stats.n)) 3307 p->env->stats.re_max = -1; 3308 } 3309 if (special(&env, p)) 3310 goto bad; 3311 serialize(&env, p->env->rex, 1); 3312 p->re_nsub = env.stats.p; 3313 if (env.type == KRE) 3314 p->re_nsub /= 2; 3315 if (env.flags & REG_DELIMITED) 3316 { 3317 p->re_npat = env.cursor - env.pattern + 1; 3318 if (*env.cursor == env.delimiter) 3319 p->re_npat++; 3320 else if (env.flags & REG_MUSTDELIM) 3321 { 3322 env.error = REG_EDELIM; 3323 goto bad; 3324 } 3325 else 3326 env.flags &= ~REG_DELIMITED; 3327 } 3328 p->env->explicit = env.explicit; 3329 p->env->flags = env.flags & REG_COMP; 3330 p->env->min = env.stats.m; 3331 p->env->nsub = env.stats.p + env.stats.u; 3332 p->env->refs = 1; 3333 return 0; 3334 bad: 3335 regfree(p); 3336 if (!env.error) 3337 env.error = REG_ESPACE; 3338 if (env.type >= SRE && env.error != REG_ESPACE && !(flags & REG_LITERAL)) 3339 { 3340 flags |= REG_LITERAL; 3341 pattern = (const char*)env.literal; 3342 goto again; 3343 } 3344 return fatal(disc, env.error, pattern); 3345 } 3346 3347 /* 3348 * regcomp() on sized pattern 3349 * the lazy way around adding and checking env.end 3350 */ 3351 3352 int 3353 regncomp(regex_t* p, const char* pattern, size_t size, regflags_t flags) 3354 { 3355 char* s; 3356 int r; 3357 3358 if (!(s = malloc(size + 1))) 3359 return fatal((flags & REG_DISCIPLINE) ? p->re_disc : &state.disc, REG_ESPACE, pattern); 3360 memcpy(s, pattern, size); 3361 s[size] = 0; 3362 r = regcomp(p, s, flags); 3363 free(s); 3364 return r; 3365 } 3366 3367 /* 3368 * combine two compiled regular expressions if possible, 3369 * replacing first with the combination and freeing second. 3370 * return 0 on success. 3371 * the only combinations handled are building a Trie 3372 * from String|Kmp|Trie and String|Kmp 3373 */ 3374 3375 int 3376 regcomb(regex_t* p, regex_t* q) 3377 { 3378 Rex_t* e = p->env->rex; 3379 Rex_t* f = q->env->rex; 3380 Rex_t* g; 3381 Cenv_t env; 3382 3383 if (!e || !f) 3384 return fatal(p->env->disc, REG_BADPAT, NiL); 3385 if (p->env->separate || q->env->separate) 3386 return REG_ESUBREG; 3387 memset(&env, 0, sizeof(env)); 3388 env.disc = p->env->disc; 3389 if (e->type == REX_BM) 3390 { 3391 p->env->rex = e->next; 3392 e->next = 0; 3393 drop(env.disc, e); 3394 e = p->env->rex; 3395 } 3396 if (f->type == REX_BM) 3397 { 3398 q->env->rex = f->next; 3399 f->next = 0; 3400 drop(env.disc, f); 3401 f = q->env->rex; 3402 } 3403 if (e->type == REX_BEG && f->type == REX_BEG) 3404 { 3405 p->env->flags |= REG_LEFT; 3406 p->env->rex = e->next; 3407 e->next = 0; 3408 drop(env.disc, e); 3409 e = p->env->rex; 3410 q->env->rex = f->next; 3411 f->next = 0; 3412 drop(env.disc, f); 3413 f = q->env->rex; 3414 } 3415 if (e->next && e->next->type == REX_END && f->next && f->next->type == REX_END) 3416 { 3417 p->env->flags |= REG_RIGHT; 3418 drop(env.disc, e->next); 3419 e->next = 0; 3420 drop(env.disc, f->next); 3421 f->next = 0; 3422 } 3423 if (!(g = trie(&env, f, e))) 3424 return fatal(p->env->disc, REG_BADPAT, NiL); 3425 p->env->rex = g; 3426 if (!q->env->once) 3427 p->env->once = 0; 3428 q->env->rex = 0; 3429 if (p->env->flags & REG_LEFT) 3430 { 3431 if (!(e = node(&env, REX_BEG, 0, 0, 0))) 3432 { 3433 regfree(p); 3434 return fatal(p->env->disc, REG_ESPACE, NiL); 3435 } 3436 e->next = p->env->rex; 3437 p->env->rex = e; 3438 p->env->once = 1; 3439 } 3440 if (p->env->flags & REG_RIGHT) 3441 { 3442 for (f = p->env->rex; f->next; f = f->next); 3443 if (f->type != REX_END) 3444 { 3445 if (!(e = node(&env, REX_END, 0, 0, 0))) 3446 { 3447 regfree(p); 3448 return fatal(p->env->disc, REG_ESPACE, NiL); 3449 } 3450 f->next = e; 3451 } 3452 } 3453 env.explicit = p->env->explicit; 3454 env.flags = p->env->flags; 3455 env.disc = p->env->disc; 3456 if (stats(&env, p->env->rex)) 3457 { 3458 regfree(p); 3459 return fatal(p->env->disc, env.error ? env.error : REG_ECOUNT, NiL); 3460 } 3461 if (special(&env, p)) 3462 { 3463 regfree(p); 3464 return fatal(p->env->disc, env.error ? env.error : REG_ESPACE, NiL); 3465 } 3466 p->env->min = g->re.trie.min; 3467 return 0; 3468 } 3469 3470 /* 3471 * copy a reference of p into q 3472 * p and q may then have separate regsubcomp() instantiations 3473 */ 3474 3475 int 3476 regdup(regex_t* p, regex_t* q) 3477 { 3478 if (!p || !q) 3479 return REG_BADPAT; 3480 *q = *p; 3481 p->env->refs++; 3482 q->re_sub = 0; 3483 return 0; 3484 } 3485