1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1985-2009 AT&T Intellectual Property * 5 * and is licensed under the * 6 * Common Public License, Version 1.0 * 7 * by AT&T Intellectual Property * 8 * * 9 * A copy of the License is available at * 10 * http://www.opensource.org/licenses/cpl1.0.txt * 11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * Phong Vo <kpv@research.att.com> * 20 * * 21 ***********************************************************************/ 22 #pragma prototyped 23 24 /* 25 * posix regex compiler 26 */ 27 28 #include "reglib.h" 29 30 #if _PACKAGE_ast 31 #include "lclib.h" 32 #endif 33 34 #define serialize re_serialize /* hp.ia64 <unistd.h>! */ 35 36 #define C_ESC (-1) 37 #define C_MB (-2) 38 39 #if _AST_REGEX_DEBUG 40 41 #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n)) 42 #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0) 43 #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_comp_debug")) debug |= strtoul(t, NiL, 0); } } while (0) 44 45 static unsigned long debug; 46 static unsigned long debug_flag; 47 48 #else 49 50 #define DEBUG_INIT() 51 #define DEBUG_TEST(f,y,n) (n) 52 #define DEBUG_CODE(f,y,n) do {n} while(0) 53 54 #endif 55 56 #if _PACKAGE_ast 57 58 typedef struct Cchr_s 59 { 60 Dtlink_t lnk; 61 unsigned char nam[2]; 62 Ckey_t key; 63 } Cchr_t; 64 65 #endif 66 67 #define eat(p) do{if ((p)->token.push)(p)->token.push=0;else (p)->cursor+=(p)->token.len;}while (0) 68 69 /* 70 * determine whether greedy matching will work, i.e. produce 71 * the best match first. such expressions are "easy", and 72 * need no backtracking once a complete match is found. 73 * if an expression has backreferences or alts it's hard 74 * else if it has only one closure it's easy 75 * else if all closures are simple (i.e. one-character) it's easy 76 * else it's hard. 77 */ 78 79 typedef struct Stats_s 80 { 81 unsigned long l; /* min length to left of x */ 82 unsigned long k; /* min length to left of y */ 83 unsigned long m; /* min length */ 84 unsigned long n; /* max length */ 85 unsigned short a; /* number of alternations */ 86 unsigned short b; /* number of backrefs */ 87 unsigned short c; /* number of closures */ 88 unsigned short i; /* number of negations */ 89 unsigned short p; /* number of named subexpressions */ 90 unsigned short s; /* number of simple closures */ 91 unsigned short t; /* number of tries */ 92 unsigned short u; /* number of unnamed subexpressions */ 93 Rex_t* x; /* max length REX_STRING */ 94 Rex_t* y; /* max length REX_TRIE */ 95 } Stats_t; 96 97 typedef struct Token_s 98 { 99 unsigned long min; 100 unsigned long max; 101 short lex; 102 short len; 103 short esc; 104 short att; 105 short push; 106 } Token_t; 107 108 typedef struct Cenv_s 109 { 110 int delimiter; /* pattern delimiter */ 111 int error; /* last error */ 112 int explicit; /* explicit match on this char */ 113 int mappeddot; /* inverse mapped '.' */ 114 int mappednewline; /* inverse mapped '\n' */ 115 int mappedslash; /* inverse mapped '/' */ 116 regflags_t flags; /* flags arg to regcomp */ 117 int type; /* BRE,ERE,ARE,SRE,KRE */ 118 unsigned char* cursor; /* curent point in re */ 119 unsigned char* pattern; /* the original pattern */ 120 unsigned char* literal; /* literal restart pattern */ 121 int parno; /* number of last open paren */ 122 int parnest; /* paren nest count */ 123 int posixkludge; /* to make * nonspecial */ 124 int regexp; /* <regexp.h> compatibility */ 125 Token_t token; /* token lookahead */ 126 Stats_t stats; /* RE statistics */ 127 int terminator; /* pattern terminator */ 128 Rex_t* paren[2*(BACK_REF_MAX+2)]; 129 /* paren[i]!=0 if \i defined */ 130 regex_t* regex; /* user handle */ 131 regdisc_t* disc; /* user discipline */ 132 unsigned char* map; /* external to native ccode map */ 133 unsigned char* MAP; /* fold and/or map */ 134 } Cenv_t; 135 136 /* 137 * allocate a new Rex_t node 138 */ 139 140 static Rex_t* 141 node(Cenv_t* env, int type, int lo, int hi, size_t extra) 142 { 143 register Rex_t* e; 144 145 if (e = (Rex_t*)alloc(env->disc, 0, sizeof(Rex_t) + extra)) 146 { 147 memset(e, 0, sizeof(Rex_t) + extra); 148 e->type = type; 149 e->marked = 0; 150 e->lo = lo; 151 e->hi = hi; 152 e->flags = env->flags; 153 e->map = (e->flags & REG_ICASE) ? env->MAP : env->map; 154 e->explicit = env->explicit; 155 if (extra) 156 e->re.data = (char*)e + sizeof(Rex_t); 157 } 158 return e; 159 } 160 161 /* 162 * free a Trie_node_t node 163 */ 164 165 static void 166 triedrop(regdisc_t* disc, Trie_node_t* e) 167 { 168 if (e) 169 { 170 triedrop(disc, e->son); 171 triedrop(disc, e->sib); 172 alloc(disc, e, 0); 173 } 174 } 175 176 /* 177 * free a Rex_t node 178 */ 179 180 void 181 drop(regdisc_t* disc, Rex_t* e) 182 { 183 int i; 184 Rex_t* x; 185 186 if (e && !(disc->re_flags & REG_NOFREE)) 187 do 188 { 189 switch (e->type) 190 { 191 case REX_ALT: 192 case REX_CONJ: 193 drop(disc, e->re.group.expr.binary.left); 194 drop(disc, e->re.group.expr.binary.right); 195 break; 196 case REX_GROUP: 197 case REX_GROUP_AHEAD: 198 case REX_GROUP_AHEAD_NOT: 199 case REX_GROUP_BEHIND: 200 case REX_GROUP_BEHIND_NOT: 201 case REX_GROUP_CUT: 202 case REX_NEG: 203 case REX_REP: 204 drop(disc, e->re.group.expr.rex); 205 break; 206 case REX_TRIE: 207 for (i = 0; i <= UCHAR_MAX; i++) 208 triedrop(disc, e->re.trie.root[i]); 209 break; 210 } 211 x = e->next; 212 alloc(disc, e, 0); 213 } while (e = x); 214 } 215 216 /* 217 * mark e and descendants minimal 218 */ 219 220 static void 221 mark(register Rex_t* e, int set) 222 { 223 if (e && !e->marked) 224 do 225 { 226 e->marked = 1; 227 if (set) 228 e->flags |= REG_MINIMAL; 229 else 230 e->flags &= ~REG_MINIMAL; 231 switch (e->type) 232 { 233 case REX_ALT: 234 case REX_CONJ: 235 case REX_GROUP_COND: 236 if (e->re.group.expr.binary.left) 237 mark(e->re.group.expr.binary.left, set); 238 if (e->re.group.expr.binary.right) 239 mark(e->re.group.expr.binary.right, set); 240 break; 241 case REX_GROUP: 242 case REX_GROUP_AHEAD: 243 case REX_GROUP_AHEAD_NOT: 244 case REX_GROUP_BEHIND: 245 case REX_GROUP_BEHIND_NOT: 246 case REX_GROUP_CUT: 247 case REX_NEG: 248 case REX_REP: 249 case REX_TRIE: 250 mark(e->re.group.expr.rex, set); 251 break; 252 } 253 } while (e = e->next); 254 } 255 256 /* 257 * assign subexpression numbers by a preorder tree walk 258 */ 259 260 static int 261 serialize(Cenv_t* env, Rex_t* e, int n) 262 { 263 do 264 { 265 e->serial = n++; 266 switch (e->type) 267 { 268 case REX_ALT: 269 case REX_GROUP_COND: 270 if (e->re.group.expr.binary.left) 271 n = serialize(env, e->re.group.expr.binary.left, n); 272 e->re.group.expr.binary.serial = n++; 273 if (e->re.group.expr.binary.right) 274 n = serialize(env, e->re.group.expr.binary.right, n); 275 break; 276 case REX_CONJ: 277 n = serialize(env, e->re.group.expr.binary.left, n); 278 n = serialize(env, e->re.group.expr.binary.right, n); 279 break; 280 case REX_GROUP: 281 case REX_GROUP_AHEAD: 282 case REX_GROUP_AHEAD_NOT: 283 case REX_GROUP_BEHIND: 284 case REX_GROUP_BEHIND_NOT: 285 case REX_GROUP_CUT: 286 case REX_NEG: 287 case REX_REP: 288 n = serialize(env, e->re.group.expr.rex, n); 289 break; 290 } 291 } while (e = e->next); 292 return n; 293 } 294 295 /* 296 * catenate e and f into a sequence, collapsing them if possible 297 */ 298 299 static Rex_t* 300 cat(Cenv_t* env, Rex_t* e, Rex_t* f) 301 { 302 Rex_t* g; 303 304 if (!f) 305 { 306 drop(env->disc, e); 307 return 0; 308 } 309 if (e->type == REX_NULL) 310 { 311 drop(env->disc, e); 312 return f; 313 } 314 if (f->type == REX_NULL) 315 { 316 g = f->next; 317 f->next = 0; 318 drop(env->disc, f); 319 f = g; 320 } 321 else if (e->type == REX_DOT && f->type == REX_DOT) 322 { 323 unsigned int m = e->lo + f->lo; 324 unsigned int n = e->hi + f->hi; 325 326 if (m <= RE_DUP_MAX) 327 { 328 if (e->hi > RE_DUP_MAX || f->hi > RE_DUP_MAX) 329 { 330 n = RE_DUP_INF; 331 goto combine; 332 } 333 else if (n <= RE_DUP_MAX) 334 { 335 combine: 336 e->lo = m; 337 e->hi = n; 338 g = f->next; 339 f->next = 0; 340 drop(env->disc, f); 341 f = g; 342 } 343 } 344 } 345 e->next = f; 346 return e; 347 } 348 349 /* 350 * collect re statistics 351 */ 352 353 static int 354 stats(register Cenv_t* env, register Rex_t* e) 355 { 356 register unsigned long n; 357 register unsigned long m; 358 unsigned long cm; 359 unsigned long nm; 360 unsigned long cn; 361 unsigned long nn; 362 unsigned long l; 363 unsigned long k; 364 unsigned long t; 365 Rex_t* q; 366 Rex_t* x; 367 Rex_t* y; 368 unsigned char c; 369 unsigned char b; 370 371 do 372 { 373 switch (e->type) 374 { 375 case REX_ALT: 376 x = env->stats.x; 377 l = env->stats.l; 378 y = env->stats.y; 379 k = env->stats.k; 380 t = env->stats.t; 381 if (++env->stats.a <= 0) 382 return 1; 383 cm = env->stats.m; 384 env->stats.m = 0; 385 cn = env->stats.n; 386 env->stats.n = 0; 387 if (stats(env, e->re.group.expr.binary.left)) 388 return 1; 389 m = env->stats.m; 390 env->stats.m = 0; 391 n = env->stats.n; 392 env->stats.n = 0; 393 if (e->re.group.expr.binary.right && stats(env, e->re.group.expr.binary.right)) 394 return 1; 395 if (env->stats.m > m) 396 env->stats.m = m; 397 else 398 m = env->stats.m; 399 if ((env->stats.m += cm) < m) 400 return 1; 401 if (env->stats.n < n) 402 env->stats.n = n; 403 else 404 n = env->stats.n; 405 if ((env->stats.n += cn) < n) 406 return 1; 407 env->stats.x = x; 408 env->stats.l = l; 409 env->stats.y = y; 410 env->stats.k = k; 411 env->stats.t = t; 412 break; 413 case REX_BACK: 414 if (++env->stats.b <= 0) 415 return 1; 416 break; 417 case REX_CLASS: 418 case REX_COLL_CLASS: 419 case REX_DOT: 420 case REX_ONECHAR: 421 n = env->stats.m; 422 if ((env->stats.m += e->lo) < n) 423 return 1; 424 if (e->hi != RE_DUP_INF) 425 { 426 n = env->stats.n; 427 if ((env->stats.n += e->hi) < n) 428 return 1; 429 } 430 if (e->lo != e->hi) 431 { 432 if (++env->stats.c <= 0) 433 return 1; 434 if (++env->stats.s <= 0) 435 return 1; 436 } 437 break; 438 case REX_CONJ: 439 cm = env->stats.m; 440 env->stats.m = 0; 441 cn = env->stats.n; 442 env->stats.n = 0; 443 if (stats(env, e->re.group.expr.binary.left)) 444 return 1; 445 nm = env->stats.m; 446 env->stats.m = 0; 447 nn = env->stats.n; 448 env->stats.n = 0; 449 if (stats(env, e->re.group.expr.binary.right)) 450 return 1; 451 if (env->stats.m < nm) 452 env->stats.m = nm; 453 else 454 nm = env->stats.m; 455 if ((env->stats.m += cm) < nm) 456 return 1; 457 if (env->stats.n < nn) 458 env->stats.n = nn; 459 else 460 nn = env->stats.n; 461 if ((env->stats.n += cn) < nn) 462 return 1; 463 break; 464 case REX_GROUP: 465 if (e->re.group.number && ++env->stats.p <= 0 || !e->re.group.number && ++env->stats.u <= 0) 466 return 1; 467 if (stats(env, e->re.group.expr.rex)) 468 return 1; 469 break; 470 case REX_GROUP_AHEAD: 471 case REX_GROUP_AHEAD_NOT: 472 case REX_GROUP_BEHIND: 473 case REX_GROUP_BEHIND_NOT: 474 m = env->stats.m; 475 n = env->stats.n; 476 x = env->stats.x; 477 y = env->stats.y; 478 if (stats(env, e->re.group.expr.rex)) 479 return 1; 480 env->stats.m = m; 481 env->stats.n = n; 482 env->stats.x = x; 483 env->stats.y = y; 484 switch (e->type) 485 { 486 case REX_GROUP_AHEAD: 487 case REX_GROUP_BEHIND: 488 if (++env->stats.u <= 0) 489 return 1; 490 break; 491 } 492 break; 493 case REX_GROUP_COND: 494 if (++env->stats.u <= 0) 495 return 1; 496 m = env->stats.m; 497 n = env->stats.n; 498 x = env->stats.x; 499 y = env->stats.y; 500 if (e->re.group.size > 0 && ++env->stats.b <= 0) 501 return 1; 502 if (e->re.group.expr.binary.left && stats(env, e->re.group.expr.binary.left)) 503 return 1; 504 if (q = e->re.group.expr.binary.right) 505 { 506 if (q->re.group.expr.binary.left && stats(env, q->re.group.expr.binary.left)) 507 return 1; 508 if (q->re.group.expr.binary.right && stats(env, q->re.group.expr.binary.right)) 509 return 1; 510 } 511 env->stats.m = m; 512 env->stats.n = n; 513 env->stats.x = x; 514 env->stats.y = y; 515 break; 516 case REX_GROUP_CUT: 517 if (++env->stats.u <= 0) 518 return 1; 519 m = env->stats.m; 520 n = env->stats.n; 521 x = env->stats.x; 522 y = env->stats.y; 523 if (stats(env, e->re.group.expr.rex)) 524 return 1; 525 env->stats.m = m; 526 env->stats.n = n; 527 env->stats.x = x; 528 env->stats.y = y; 529 break; 530 case REX_NEG: 531 env->stats.i++; 532 x = env->stats.x; 533 l = env->stats.l; 534 y = env->stats.y; 535 k = env->stats.k; 536 t = env->stats.t; 537 cm = env->stats.m; 538 env->stats.m = 0; 539 if (stats(env, e->re.group.expr.rex)) 540 return 1; 541 env->stats.m = !env->stats.m; 542 if ((env->stats.m += cm) < cm) 543 return 1; 544 env->stats.x = x; 545 env->stats.l = l; 546 env->stats.y = y; 547 env->stats.k = k; 548 env->stats.t = t; 549 break; 550 case REX_REP: 551 x = env->stats.x; 552 l = env->stats.l; 553 y = env->stats.y; 554 k = env->stats.k; 555 t = env->stats.t; 556 if (++env->stats.c <= 0) 557 return 1; 558 b = env->stats.b; 559 c = env->stats.c; 560 cm = env->stats.m; 561 env->stats.m = 0; 562 if (stats(env, e->re.group.expr.rex)) 563 return 1; 564 if (env->stats.m == 1 && b == env->stats.b && c == env->stats.c && ++env->stats.s <= 0) 565 return 1; 566 if (e->lo < 1) 567 { 568 env->stats.x = x; 569 env->stats.l = l; 570 env->stats.y = y; 571 env->stats.k = k; 572 env->stats.t = t; 573 env->stats.m = cm; 574 } 575 else 576 { 577 m = env->stats.m; 578 if ((env->stats.m *= e->lo) > 0 && env->stats.m < m) 579 return 1; 580 m = env->stats.m; 581 if ((env->stats.m += cm) < m) 582 return 1; 583 if (env->stats.x != x) 584 env->stats.l = cm; 585 if (env->stats.y != y) 586 env->stats.k = cm; 587 } 588 break; 589 case REX_STRING: 590 if (!e->map) 591 { 592 cm = env->stats.m; 593 if ((env->stats.m += e->re.string.size) < cm) 594 return 1; 595 cn = env->stats.n; 596 if ((env->stats.n += e->re.string.size) < cn) 597 return 1; 598 if (!env->stats.x || env->stats.x->re.string.size < e->re.string.size) 599 { 600 env->stats.x = e; 601 env->stats.l = cm; 602 } 603 } 604 break; 605 case REX_TRIE: 606 if (++env->stats.s <= 0) 607 return 1; 608 cm = env->stats.m; 609 if ((env->stats.m += e->re.trie.min) < cm) 610 return 1; 611 cn = env->stats.n; 612 if ((env->stats.n += e->re.trie.max) < cn) 613 return 1; 614 env->stats.t++; 615 if (!env->stats.y || env->stats.y->re.trie.min < e->re.trie.min) 616 { 617 env->stats.y = e; 618 env->stats.k = cm; 619 } 620 break; 621 } 622 } while (e = e->next); 623 return 0; 624 } 625 626 static int token(Cenv_t*); 627 628 static int 629 magic(register Cenv_t* env, register int c, int escaped) 630 { 631 register char* sp; 632 register int n; 633 int o = c; 634 int e = env->error; 635 int l = env->token.len; 636 short* mp; 637 char* ep; 638 639 if (mp = state.magic[c]) 640 { 641 c = mp[env->type+escaped]; 642 if (c >= T_META) 643 { 644 sp = (char*)env->cursor + env->token.len; 645 switch (c) 646 { 647 case T_LEFT: 648 n = 0; 649 ep = sp; 650 while (*sp >= '0' && *sp <= '9') 651 { 652 if (n > (INT_MAX / 10)) 653 { 654 env->error = REG_BADBR; 655 goto bad; 656 } 657 n = n * 10 + *sp++ - '0'; 658 } 659 if (sp == ep) 660 { 661 if (env->type < SRE || *sp != ',') 662 { 663 env->error = *sp ? REG_BADBR : REG_EBRACE; 664 goto bad; 665 } 666 } 667 else if (n > RE_DUP_MAX) 668 { 669 env->error = REG_BADBR; 670 goto bad; 671 } 672 env->token.min = n; 673 if (*sp == ',') 674 { 675 n = 0; 676 ep = ++sp; 677 while (*sp >= '0' && *sp <= '9') 678 { 679 if (n > (INT_MAX / 10)) 680 { 681 env->error = REG_BADBR; 682 goto bad; 683 } 684 n = n * 10 + *sp++ - '0'; 685 } 686 if (sp == ep) 687 n = RE_DUP_INF; 688 else if (n < env->token.min) 689 { 690 env->error = REG_BADBR; 691 goto bad; 692 } 693 } 694 env->token.max = n; 695 switch (*sp) 696 { 697 case 0: 698 env->error = REG_EBRACE; 699 goto bad; 700 case '\\': 701 if (!escaped) 702 { 703 env->error = REG_BADBR; 704 goto bad; 705 } 706 sp++; 707 break; 708 default: 709 if (escaped) 710 { 711 env->error = REG_BADBR; 712 goto bad; 713 } 714 break; 715 } 716 switch (*sp++) 717 { 718 case 0: 719 env->error = REG_EBRACE; 720 goto bad; 721 case '}': 722 break; 723 default: 724 env->error = REG_BADBR; 725 goto bad; 726 } 727 env->token.len = sp - (char*)env->cursor; 728 break; 729 case T_RIGHT: 730 env->error = REG_EBRACE; 731 goto bad; 732 case T_OPEN: 733 if (env->type < SRE && *sp == '?') 734 { 735 env->token.len++; 736 env->token.lex = 0; 737 goto group; 738 } 739 break; 740 case T_ESCAPE: 741 c = chresc(sp - 2, &ep); 742 if (ep < sp) 743 goto bad; 744 env->token.len += ep - sp; 745 if (c >= T_META) 746 { 747 env->token.lex = c; 748 c = C_ESC; 749 } 750 return c; 751 case T_BACK+0: 752 case T_BACK+1: 753 case T_BACK+2: 754 case T_BACK+3: 755 case T_BACK+4: 756 case T_BACK+5: 757 case T_BACK+6: 758 case T_BACK+7: 759 n = chresc(sp - 2, &ep); 760 if (ep > sp + 1) 761 { 762 env->token.len += ep - sp; 763 return n; 764 } 765 /*FALLTHROUGH*/ 766 case T_BACK+8: 767 case T_BACK+9: 768 if (env->type == SRE || c == T_BACK && !(env->flags & REG_LENIENT)) 769 { 770 env->error = REG_BADESC; 771 goto bad; 772 } 773 if ((env->flags & REG_MULTIREF) && isdigit(*sp)) 774 { 775 c = (c - T_BACK) * 10 + (*sp - '0'); 776 if (c > 0 && c <= env->parno && env->paren[c]) 777 c += T_BACK; 778 else 779 c = chresc(sp - 2, &ep); 780 env->token.len++; 781 } 782 if (c == T_BACK) 783 c = 0; 784 break; 785 case T_BAD: 786 if (escaped == 1 && (env->flags & REG_LENIENT) && (c = mp[env->type+escaped+2]) >= T_META) 787 return c; 788 goto bad; 789 } 790 if (env->type >= SRE) 791 { 792 if (c == T_DOT) 793 c = '.'; 794 else if (c < T_OPEN) 795 { 796 if (env->type == KRE && *(env->cursor + env->token.len) == '-' && *(env->cursor + env->token.len + 1) == '(') 797 { 798 env->token.len++; 799 env->token.att = 1; 800 } 801 if (env->type == KRE && *(env->cursor + env->token.len) == '(') 802 { 803 env->token.len++; 804 switch (c) 805 { 806 case T_AT: 807 break; 808 case T_PERCENT: 809 env->token.lex = c; 810 goto group; 811 case T_TILDE: 812 env->token.lex = 0; 813 goto group; 814 default: 815 env->token.lex = c; 816 break; 817 } 818 c = T_OPEN; 819 } 820 else if (c == T_STAR) 821 c = T_DOTSTAR; 822 else if (c == T_QUES) 823 c = T_DOT; 824 else 825 { 826 c = o; 827 env->token.len = l; 828 } 829 } 830 else if (c > T_BACK) 831 { 832 c = (c - T_BACK) * 2 - 1; 833 c = (c > env->parno || !env->paren[c]) ? o : T_BACK + c; 834 } 835 else if (env->type == KRE && !env->parnest && (env->flags & REG_SHELL_GROUP)) 836 { 837 if (c == T_AND) 838 c = '&'; 839 else if (c == T_BAR) 840 c = '|'; 841 else if (c == T_OPEN) 842 c = '('; 843 } 844 } 845 } 846 } 847 else if (escaped == 2) 848 { 849 if (env->type >= SRE && !(env->flags & REG_SHELL_ESCAPED) || (env->flags & REG_ESCAPE) && (c == '[' || c == '-' || c == ']' || env->delimiter && c == env->delimiter)) 850 /*ok*/; 851 else 852 { 853 env->error = REG_BADESC; 854 goto bad; 855 } 856 } 857 else if (escaped && !(env->flags & REG_LENIENT) && c != ']') 858 { 859 env->error = REG_BADESC; 860 goto bad; 861 } 862 return c; 863 group: 864 sp = (char*)env->cursor + env->token.len; 865 switch (*sp++) 866 { 867 case ')': 868 break; 869 case '#': 870 for (;;) 871 { 872 switch (*sp++) 873 { 874 case 0: 875 env->error = REG_EPAREN; 876 return T_BAD; 877 case ')': 878 break; 879 default: 880 continue; 881 } 882 break; 883 } 884 break; 885 default: 886 return T_GROUP; 887 } 888 env->cursor = (unsigned char*)sp; 889 return token(env); 890 bad: 891 if (escaped == 2) 892 env->error = e; 893 else if (env->flags & REG_LENIENT) 894 return o; 895 else if (escaped == 1 && !env->error) 896 { 897 if (mp || o == ']') 898 return o; 899 env->error = REG_BADESC; 900 } 901 return T_BAD; 902 } 903 904 static int 905 token(register Cenv_t* env) 906 { 907 int c; 908 int posixkludge; 909 910 if (env->token.push) 911 return env->token.lex; 912 env->token.att = env->token.esc = 0; 913 if ((env->token.len = MBSIZE(env->cursor)) > 1) 914 return env->token.lex = C_MB; 915 env->token.lex = 0; 916 for (;;) 917 { 918 c = *env->cursor; 919 if (c == 0 || c == env->delimiter || c == env->terminator) 920 return T_END; 921 if (!(env->flags & REG_COMMENT)) 922 break; 923 if (c == '#') 924 { 925 do 926 { 927 c = *++env->cursor; 928 if (c == 0 || c == env->delimiter) 929 return T_END; 930 } while (c != '\n'); 931 } 932 else if (!isspace(c)) 933 break; 934 env->cursor++; 935 } 936 if (c == '\n' && (env->flags & REG_MULTIPLE) && !env->delimiter) 937 { 938 if (env->parnest) 939 { 940 env->error = REG_EPAREN; 941 return T_BAD; 942 } 943 env->parno = 0; 944 env->pattern = env->cursor + 1; 945 return T_BAR; 946 } 947 if (env->flags & REG_LITERAL) 948 return c; 949 if (posixkludge = env->posixkludge) 950 { 951 env->posixkludge = 0; 952 if (c == '*') 953 return c; 954 } 955 if (c == '\\') 956 { 957 if (env->flags & REG_SHELL_ESCAPED) 958 return c; 959 if (!(c = *(env->cursor + 1)) || c == env->terminator) 960 { 961 if (env->flags & REG_LENIENT) 962 { 963 if (c) 964 { 965 env->token.esc = env->token.len; 966 env->token.len += MBSIZE(env->cursor + 1); 967 return c; 968 } 969 return '\\'; 970 } 971 env->error = REG_EESCAPE; 972 return T_BAD; 973 } 974 env->token.esc = env->token.len; 975 env->token.len += MBSIZE(env->cursor + 1); 976 if (env->delimiter && c == 'n') 977 return '\n'; 978 else if (c == env->delimiter) 979 return magic(env, c, 0); 980 else if (c == '(' && env->type == BRE) 981 env->posixkludge = 1; 982 else if (c == ')' && env->type == BRE && env->parnest <= 0) 983 { 984 env->error = REG_EPAREN; 985 return T_BAD; 986 } 987 else if (isspace(c) && (env->flags & REG_COMMENT)) 988 return c; 989 return magic(env, c, 1); 990 } 991 else if (c == '$') 992 { 993 if (env->type == BRE && (*(env->cursor + 1) == 0 || *(env->cursor + 1) == env->delimiter || *(env->cursor + 1) == env->terminator || *(env->cursor + 1) == '\\' && *(env->cursor + 2) == ')') || (env->flags & REG_MULTIPLE) && *(env->cursor + 1) == '\n') 994 return T_DOLL; 995 } 996 else if (c == '^') 997 { 998 if (env->type == BRE && (env->cursor == env->pattern || posixkludge)) 999 { 1000 env->posixkludge = 1; 1001 return T_CFLX; 1002 } 1003 } 1004 else if (c == ')') 1005 { 1006 if (env->type != BRE && env->parnest <= 0) 1007 return c; 1008 } 1009 else if (c == '/' && env->explicit == env->mappedslash) 1010 { 1011 while (*(env->cursor + env->token.len) == c) 1012 env->token.len++; 1013 return T_SLASHPLUS; 1014 } 1015 return magic(env, c, 0); 1016 } 1017 1018 static Celt_t* 1019 col(Celt_t* ce, int ic, unsigned char* bp, int bw, int bc, unsigned char* ep, int ew, int ec) 1020 { 1021 register char* s; 1022 register unsigned char* k; 1023 register unsigned char* e; 1024 register int c; 1025 register int cc; 1026 int bt; 1027 int et; 1028 Ckey_t key; 1029 1030 cc = 0; 1031 for (;;) 1032 { 1033 k = key; 1034 if (bw == 1) 1035 { 1036 c = bc; 1037 if (ic) 1038 { 1039 if (isupper(c)) 1040 { 1041 c = tolower(c); 1042 cc = -1; 1043 } 1044 else if (islower(c)) 1045 { 1046 c = toupper(c); 1047 cc = -1; 1048 } 1049 } 1050 *k++ = c; 1051 } 1052 else if (bw < COLL_KEY_MAX) 1053 { 1054 s = (char*)bp; 1055 if (ic) 1056 { 1057 c = mbchar(s); 1058 if (iswupper(c)) 1059 { 1060 c = towlower(c); 1061 cc = 1; 1062 } 1063 else if (iswlower(c)) 1064 { 1065 c = towupper(c); 1066 cc = 1; 1067 } 1068 } 1069 if (cc > 0) 1070 { 1071 cc = -1; 1072 k += wctomb((char*)k, c); 1073 } 1074 else 1075 for (e = k + bw; k < e; *k++ = *s++); 1076 } 1077 *k = 0; 1078 mbxfrm(ce->beg, key, COLL_KEY_MAX); 1079 if (ep) 1080 { 1081 k = key; 1082 c = mbchar(k); 1083 if (iswupper(c)) 1084 bt = COLL_range_uc; 1085 else if (iswlower(c)) 1086 bt = COLL_range_lc; 1087 else 1088 bt = COLL_range; 1089 k = key; 1090 if (ew == 1) 1091 { 1092 c = ec; 1093 if (ic) 1094 { 1095 if (isupper(c)) 1096 { 1097 c = tolower(c); 1098 cc = -1; 1099 } 1100 else if (islower(c)) 1101 { 1102 c = toupper(c); 1103 cc = -1; 1104 } 1105 } 1106 *k++ = c; 1107 } 1108 else if (ew < COLL_KEY_MAX) 1109 { 1110 s = (char*)ep; 1111 if (ic) 1112 { 1113 c = mbchar(s); 1114 if (iswupper(c)) 1115 { 1116 c = towlower(c); 1117 cc = 1; 1118 } 1119 else if (iswlower(c)) 1120 { 1121 c = towupper(c); 1122 cc = 1; 1123 } 1124 } 1125 if (cc > 0) 1126 { 1127 cc = -1; 1128 k += wctomb((char*)k, c); 1129 } 1130 else 1131 for (e = k + ew; k < e; *k++ = *s++); 1132 } 1133 *k = 0; 1134 mbxfrm(ce->end, key, COLL_KEY_MAX); 1135 k = key; 1136 c = mbchar(k); 1137 if (iswupper(c)) 1138 et = COLL_range_uc; 1139 else if (iswlower(c)) 1140 et = COLL_range_lc; 1141 else 1142 et = COLL_range; 1143 ce->typ = bt == et ? bt : COLL_range; 1144 } 1145 else 1146 ce->typ = COLL_char; 1147 ce++; 1148 if (!ic || !cc) 1149 break; 1150 ic = 0; 1151 } 1152 return ce; 1153 } 1154 1155 static Rex_t* 1156 bra(Cenv_t* env) 1157 { 1158 Rex_t* e; 1159 int c; 1160 int i; 1161 int w; 1162 int neg; 1163 int last; 1164 int inrange; 1165 int complicated; 1166 int collate; 1167 int elements; 1168 unsigned char* first; 1169 unsigned char* start; 1170 unsigned char* begin; 1171 unsigned char* s; 1172 regclass_t f; 1173 unsigned char buf[4 * (COLL_KEY_MAX + 1)]; 1174 #if _PACKAGE_ast 1175 int ic; 1176 char mbc[COLL_KEY_MAX + 1]; 1177 #endif 1178 1179 if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) 1180 return 0; 1181 collate = complicated = elements = 0; 1182 if (*env->cursor == '^' || env->type >= SRE && *env->cursor == '!') 1183 { 1184 env->cursor++; 1185 neg = 1; 1186 } 1187 else 1188 neg = 0; 1189 first = env->cursor; 1190 start = first + MBSIZE(first); 1191 if (*env->cursor == 0 || *(env->cursor + 1) == 0 || *env->cursor == env->terminator || *(env->cursor + 1) == env->terminator || (env->flags & REG_ESCAPE) && (*env->cursor == env->delimiter || *env->cursor != '\\' && *(env->cursor + 1) == env->delimiter)) 1192 goto error; 1193 begin = env->cursor + MBSIZE(env->cursor); 1194 1195 /* 1196 * inrange: 0=no, 1=possibly, 2=definitely 1197 */ 1198 1199 inrange = 0; 1200 for (;;) 1201 { 1202 if (!(c = *env->cursor) || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter) 1203 goto error; 1204 env->cursor += (w = MBSIZE(env->cursor)); 1205 if (c == '\\') 1206 { 1207 if (*env->cursor) 1208 { 1209 if (*env->cursor == 'n') 1210 { 1211 env->cursor++; 1212 c = '\n'; 1213 } 1214 else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) 1215 { 1216 env->token.len = 1; 1217 w = magic(env, *env->cursor, 2); 1218 if (env->token.len > 1 || w != T_BAD) 1219 { 1220 if (env->token.len == 1 && (f = classfun(w))) 1221 { 1222 if (inrange > 1) 1223 { 1224 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1225 goto erange; 1226 inrange = 0; 1227 } 1228 env->cursor++; 1229 for (c = 0; c <= UCHAR_MAX; c++) 1230 if ((*f)(c)) 1231 setadd(e->re.charclass, c); 1232 complicated++; 1233 elements++; 1234 continue; 1235 } 1236 if (env->token.len > 1 || w >= 0 && w < T_META) 1237 { 1238 c = w; 1239 if (c > UCHAR_MAX) 1240 { 1241 if (env->type < SRE && !(env->flags & REG_LENIENT) && !mbwide()) 1242 goto erange; 1243 c = UCHAR_MAX; 1244 } 1245 env->cursor += env->token.len; 1246 } 1247 } 1248 } 1249 } 1250 } 1251 else if (c == ']') 1252 { 1253 if (env->cursor == begin) 1254 { 1255 last = c; 1256 inrange = 1; 1257 continue; 1258 } 1259 if (inrange != 0) 1260 { 1261 setadd(e->re.charclass, last); 1262 elements++; 1263 if (inrange == 2) 1264 { 1265 setadd(e->re.charclass, '-'); 1266 elements++; 1267 } 1268 } 1269 break; 1270 } 1271 else if (c == '-') 1272 { 1273 if (!inrange && env->cursor != begin && *env->cursor != ']') 1274 { 1275 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1276 goto erange; 1277 continue; 1278 } 1279 else if (inrange == 1) 1280 { 1281 inrange = 2; 1282 complicated++; 1283 continue; 1284 } 1285 } 1286 else if (c == '[') 1287 { 1288 switch (*env->cursor) 1289 { 1290 case 0: 1291 goto error; 1292 case ':': 1293 if (env->regexp) 1294 goto normal; 1295 if (inrange == 1) 1296 { 1297 setadd(e->re.charclass, last); 1298 elements++; 1299 } 1300 if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) 1301 { 1302 if (env->cursor == start && (c = *(env->cursor + 1))) 1303 { 1304 s = start = env->cursor + 1; 1305 while (*++s && *s != ':'); 1306 if (*s == ':' && *(s + 1) == ']' && *(s + 2) == ']') 1307 { 1308 if ((i = (s - start)) == 1) 1309 { 1310 switch (c) 1311 { 1312 case '<': 1313 i = REX_WBEG; 1314 break; 1315 case '>': 1316 i = REX_WEND; 1317 break; 1318 default: 1319 i = 0; 1320 break; 1321 } 1322 if (i) 1323 { 1324 env->cursor = s + 3; 1325 drop(env->disc, e); 1326 return node(env, i, 0, 0, 0); 1327 } 1328 } 1329 } 1330 } 1331 env->error = REG_ECTYPE; 1332 goto error; 1333 } 1334 for (c = 0; c <= UCHAR_MAX; c++) 1335 if ((*f)(c)) 1336 setadd(e->re.charclass, c); 1337 inrange = 0; 1338 complicated++; 1339 elements++; 1340 continue; 1341 case '=': 1342 if (env->regexp) 1343 goto normal; 1344 if (inrange == 2) 1345 goto erange; 1346 if (inrange == 1) 1347 { 1348 setadd(e->re.charclass, last); 1349 elements++; 1350 } 1351 if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0) 1352 goto ecollate; 1353 if (c > 1) 1354 collate++; 1355 else 1356 setadd(e->re.charclass, buf[0]); 1357 c = buf[0]; 1358 inrange = 0; 1359 complicated++; 1360 elements++; 1361 continue; 1362 case '.': 1363 if (env->regexp) 1364 goto normal; 1365 if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0) 1366 goto ecollate; 1367 if (c > 1) 1368 collate++; 1369 c = buf[0]; 1370 complicated++; 1371 break; 1372 default: 1373 normal: 1374 if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) 1375 goto error; 1376 break; 1377 } 1378 } 1379 else if (w > 1) 1380 complicated++; 1381 if (inrange == 2) 1382 { 1383 if (last > c) 1384 { 1385 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1386 goto erange; 1387 setadd(e->re.charclass, last); 1388 setadd(e->re.charclass, c); 1389 } 1390 else 1391 for (i = last; i <= c; i++) 1392 setadd(e->re.charclass, i); 1393 inrange = env->type >= SRE || (env->flags & REG_LENIENT); 1394 elements += 2; 1395 } 1396 else if (inrange == 1) 1397 { 1398 setadd(e->re.charclass, last); 1399 elements++; 1400 } 1401 else 1402 inrange = 1; 1403 last = c; 1404 } 1405 #if _PACKAGE_ast 1406 if (complicated && mbcoll()) 1407 { 1408 Dt_t* dt; 1409 Cchr_t* cc; 1410 Cchr_t* tc; 1411 Cchr_t* xc; 1412 Celt_t* ce; 1413 Cchr_t key; 1414 int rw; 1415 int rc; 1416 int wc; 1417 unsigned char* rp; 1418 unsigned char* pp; 1419 char* wp; 1420 char cb[2][COLL_KEY_MAX+1]; 1421 1422 static Dtdisc_t disc; 1423 1424 static const char primary[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; 1425 1426 if (!(dt = (Dt_t*)LCINFO(AST_LC_COLLATE)->data)) 1427 { 1428 disc.key = offsetof(Cchr_t, key); 1429 if ((cc = newof(0, Cchr_t, elementsof(primary), 0)) && (dt = dtopen(&disc, Dttree))) 1430 { 1431 for (i = 0; i < elementsof(primary) - 1; i++, cc++) 1432 { 1433 cc->nam[0] = primary[i]; 1434 mbxfrm(cc->key, cc->nam, COLL_KEY_MAX); 1435 dtinsert(dt, cc); 1436 } 1437 for (i = 0; i < elementsof(cc->key); i++) 1438 cc->key[i] = ~0; 1439 dtinsert(dt, cc); 1440 LCINFO(AST_LC_COLLATE)->data = (void*)dt; 1441 } 1442 else 1443 { 1444 if (cc) 1445 free(cc); 1446 drop(env->disc, e); 1447 return 0; 1448 } 1449 } 1450 if (dt) 1451 { 1452 drop(env->disc, e); 1453 if (ic = env->flags & REG_ICASE) 1454 elements *= 2; 1455 if (!(e = node(env, REX_COLL_CLASS, 1, 1, (elements + 2) * sizeof(Celt_t)))) 1456 return 0; 1457 ce = (Celt_t*)e->re.data; 1458 e->re.collate.invert = neg; 1459 e->re.collate.elements = ce; 1460 env->cursor = first; 1461 inrange = 0; 1462 for (;;) 1463 { 1464 if ((c = *env->cursor) == 0 || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter) 1465 goto error; 1466 pp = env->cursor; 1467 env->cursor += (w = MBSIZE(env->cursor)); 1468 if (c == '\\') 1469 { 1470 if (*env->cursor) 1471 { 1472 if (*env->cursor == 'n') 1473 { 1474 pp = env->cursor++; 1475 c = '\n'; 1476 } 1477 else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) 1478 { 1479 env->token.len = 1; 1480 w = magic(env, *env->cursor, 2); 1481 if (env->token.len > 1 || w != T_BAD) 1482 { 1483 if (env->token.len == 1 && (f = classfun(w))) 1484 { 1485 if (inrange > 1) 1486 { 1487 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1488 goto erange; 1489 inrange = 0; 1490 } 1491 env->cursor++; 1492 ce->fun = f; 1493 ce->typ = COLL_call; 1494 ce++; 1495 continue; 1496 } 1497 if (env->token.len > 1 || w >= 0 && w < T_META) 1498 { 1499 c = w; 1500 w = wctomb(mbc, c); 1501 pp = (unsigned char*)mbc; 1502 env->cursor += env->token.len; 1503 } 1504 } 1505 } 1506 } 1507 } 1508 else if (c == ']') 1509 { 1510 if (env->cursor == begin) 1511 { 1512 rp = pp; 1513 rw = w; 1514 inrange = 1; 1515 continue; 1516 } 1517 if (inrange != 0) 1518 { 1519 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1520 if (inrange == 2) 1521 ce = col(ce, ic, NiL, 1, '-', NiL, 0, 0); 1522 } 1523 break; 1524 } 1525 else if (c == '-') 1526 { 1527 if (!inrange && env->cursor != begin && *env->cursor != ']') 1528 { 1529 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1530 goto erange; 1531 continue; 1532 } 1533 else if (inrange == 1) 1534 { 1535 inrange = 2; 1536 continue; 1537 } 1538 } 1539 else if (c == '[') 1540 { 1541 switch (*env->cursor) 1542 { 1543 case 0: 1544 goto error; 1545 case ':': 1546 if (env->regexp) 1547 goto complicated_normal; 1548 if (inrange == 1) 1549 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1550 if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) 1551 { 1552 if (env->cursor == start && (c = *(env->cursor + 1)) && *(env->cursor + 2) == ':' && *(env->cursor + 3) == ']' && *(env->cursor + 4) == ']') 1553 { 1554 switch (c) 1555 { 1556 case '<': 1557 i = REX_WBEG; 1558 break; 1559 case '>': 1560 i = REX_WEND; 1561 break; 1562 default: 1563 i = 0; 1564 break; 1565 } 1566 if (i) 1567 { 1568 env->cursor += 5; 1569 drop(env->disc, e); 1570 return node(env, i, 0, 0, 0); 1571 } 1572 } 1573 env->error = REG_ECTYPE; 1574 goto error; 1575 } 1576 ce->fun = f; 1577 ce->typ = COLL_call; 1578 ce++; 1579 inrange = 0; 1580 continue; 1581 case '=': 1582 if (env->regexp) 1583 goto complicated_normal; 1584 if (inrange == 2) 1585 goto erange; 1586 if (inrange == 1) 1587 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1588 pp = (unsigned char*)cb[inrange]; 1589 rp = env->cursor + 1; 1590 if ((rw = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0) 1591 goto ecollate; 1592 wp = (char*)pp; 1593 wc = mbchar(wp); 1594 c = 0; 1595 if (ic) 1596 { 1597 if (iswupper(wc)) 1598 { 1599 wc = towlower(wc); 1600 rw = wctomb((char*)pp, wc); 1601 c = 'u'; 1602 } 1603 else if (iswlower(wc)) 1604 c = 'l'; 1605 } 1606 for (;;) 1607 { 1608 mbxfrm(key.key, (char*)pp, COLL_KEY_MAX); 1609 if (!(cc = (Cchr_t*)dtsearch(dt, &key)) && !(cc = (Cchr_t*)dtprev(dt, &key))) 1610 goto ecollate; 1611 xc = (tc = (Cchr_t*)dtprev(dt, cc)) && !strcasecmp((char*)tc->nam, (char*)cc->nam) ? tc : cc; 1612 if (c == 'l' || c == 'L' && !(c = 0)) 1613 ce->typ = COLL_range_lc; 1614 else if (c == 'u' || c == 'U' && !(c = 0)) 1615 ce->typ = COLL_range_uc; 1616 else 1617 ce->typ = COLL_range; 1618 strcpy((char*)ce->beg, (char*)xc->key); 1619 if (!(cc = (Cchr_t*)dtnext(dt, cc))) 1620 goto ecollate; 1621 if (!strcasecmp((char*)xc->nam, (char*)cc->nam) && (tc = (Cchr_t*)dtnext(dt, cc))) 1622 cc = tc; 1623 strcpy((char*)ce->end, (char*)cc->key); 1624 ce->max = -1; 1625 ce++; 1626 if (!c) 1627 break; 1628 if (c == 'u') 1629 { 1630 wc = towlower(wc); 1631 c = 'L'; 1632 } 1633 else 1634 { 1635 wc = towupper(wc); 1636 c = 'U'; 1637 } 1638 rw = wctomb((char*)pp, wc); 1639 } 1640 inrange = 0; 1641 c = *pp; 1642 continue; 1643 case '.': 1644 if (env->regexp) 1645 goto complicated_normal; 1646 pp = (unsigned char*)cb[inrange]; 1647 if ((w = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0) 1648 goto ecollate; 1649 c = buf[0]; 1650 break; 1651 default: 1652 complicated_normal: 1653 if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) 1654 goto error; 1655 break; 1656 } 1657 } 1658 if (inrange == 2) 1659 { 1660 ce = col(ce, ic, rp, rw, rc, pp, w, c); 1661 if (strcmp((char*)ce->beg, (char*)ce->end) > 0) 1662 { 1663 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1664 goto erange; 1665 (ce-1)->typ = COLL_char; 1666 strcpy((char*)ce->beg, (char*)(ce-1)->end); 1667 ce->typ = COLL_char; 1668 ce++; 1669 } 1670 inrange = env->type >= SRE || (env->flags & REG_LENIENT); 1671 } 1672 else if (inrange == 1) 1673 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1674 else 1675 inrange = 1; 1676 rp = pp; 1677 rw = w; 1678 rc = c; 1679 } 1680 ce->typ = COLL_end; 1681 return e; 1682 } 1683 } 1684 #endif 1685 if (collate) 1686 goto ecollate; 1687 if (env->flags & REG_ICASE) 1688 for (i = 0; i <= UCHAR_MAX; i++) 1689 if (settst(e->re.charclass, i)) 1690 { 1691 if (isupper(i)) 1692 c = tolower(i); 1693 else if (islower(i)) 1694 c = toupper(i); 1695 else 1696 continue; 1697 setadd(e->re.charclass, c); 1698 } 1699 if (neg) 1700 { 1701 for (i = 0; i < elementsof(e->re.charclass->bits); i++) 1702 e->re.charclass->bits[i] ^= ~0; 1703 if (env->explicit >= 0) 1704 setclr(e->re.charclass, env->explicit); 1705 } 1706 return e; 1707 ecollate: 1708 env->error = REG_ECOLLATE; 1709 goto error; 1710 erange: 1711 env->error = REG_ERANGE; 1712 error: 1713 drop(env->disc, e); 1714 if (!env->error) 1715 env->error = REG_EBRACK; 1716 return 0; 1717 } 1718 1719 static Rex_t* 1720 ccl(Cenv_t* env, int type) 1721 { 1722 int i; 1723 Rex_t* e; 1724 Celt_t* ce; 1725 regclass_t f; 1726 1727 if (!(f = classfun(type))) 1728 { 1729 env->error = REG_BADESC; 1730 return 0; 1731 } 1732 if (!mbcoll()) 1733 { 1734 if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) 1735 return 0; 1736 for (i = 0; i <= UCHAR_MAX; i++) 1737 if ((*f)(i)) 1738 setadd(e->re.charclass, i); 1739 if (env->explicit >= 0) 1740 setclr(e->re.charclass, env->explicit); 1741 } 1742 else 1743 { 1744 if (!(e = node(env, REX_COLL_CLASS, 1, 1, 2 * sizeof(Celt_t)))) 1745 return 0; 1746 ce = (Celt_t*)e->re.data; 1747 e->re.collate.invert = 0; 1748 e->re.collate.elements = ce; 1749 ce->fun = f; 1750 ce->typ = COLL_call; 1751 ce++; 1752 ce->typ = COLL_end; 1753 } 1754 return e; 1755 } 1756 1757 static Rex_t* 1758 rep(Cenv_t* env, Rex_t* e, int number, int last) 1759 { 1760 Rex_t* f; 1761 unsigned long m = 0; 1762 unsigned long n = RE_DUP_INF; 1763 int minimal = -1; 1764 1765 if (!e) 1766 return 0; 1767 switch (token(env)) 1768 { 1769 case T_BANG: 1770 eat(env); 1771 if (!(f = node(env, REX_NEG, m, n, 0))) 1772 { 1773 drop(env->disc, e); 1774 return 0; 1775 } 1776 f->re.group.expr.rex = e; 1777 return f; 1778 case T_QUES: 1779 eat(env); 1780 n = 1; 1781 break; 1782 case T_STAR: 1783 eat(env); 1784 break; 1785 case T_PLUS: 1786 eat(env); 1787 m = 1; 1788 break; 1789 case T_LEFT: 1790 eat(env); 1791 m = env->token.min; 1792 n = env->token.max; 1793 break; 1794 default: 1795 return e; 1796 } 1797 if (env->token.att) 1798 minimal = 1; 1799 else if (env->type < SRE) 1800 switch (token(env)) 1801 { 1802 case T_QUES: 1803 eat(env); 1804 minimal = !(env->flags & REG_MINIMAL); 1805 break; 1806 case T_STAR: /*AST*/ 1807 eat(env); 1808 minimal = !!(env->flags & REG_MINIMAL); 1809 break; 1810 } 1811 switch (e->type) 1812 { 1813 case REX_DOT: 1814 case REX_CLASS: 1815 case REX_COLL_CLASS: 1816 case REX_ONECHAR: 1817 e->lo = m; 1818 e->hi = n; 1819 if (minimal >= 0) 1820 mark(e, minimal); 1821 return e; 1822 #if HUH_2002_08_07 1823 case REX_BEG: 1824 #endif 1825 case REX_BEG_STR: 1826 case REX_END_STR: 1827 case REX_FIN_STR: 1828 case REX_WBEG: 1829 case REX_WEND: 1830 case REX_WORD: 1831 case REX_WORD_NOT: 1832 env->error = REG_BADRPT; 1833 drop(env->disc, e); 1834 return 0; 1835 } 1836 if (m == 1 && n == 1) 1837 { 1838 if (minimal >= 0) 1839 mark(e, minimal); 1840 return e; 1841 } 1842 if (!(f = node(env, REX_REP, m, n, 0))) 1843 { 1844 drop(env->disc, e); 1845 return 0; 1846 } 1847 f->re.group.expr.rex = e; 1848 f->re.group.number = number; 1849 f->re.group.last = last; 1850 if (minimal >= 0) 1851 mark(f, minimal); 1852 if (m <= n && n) 1853 { 1854 for (; e && e->type >= REX_GROUP && e->type <= REX_GROUP_CUT; e = e->re.group.expr.rex); 1855 if (e && e->type == REX_NEG) 1856 f->type = REX_GROUP; 1857 } 1858 return f; 1859 } 1860 1861 static int 1862 isstring(Rex_t* e) 1863 { 1864 switch (e->type) 1865 { 1866 case REX_ONECHAR: 1867 return e->lo == 1 && e->hi == 1; 1868 case REX_STRING: 1869 return 1; 1870 } 1871 return 0; 1872 } 1873 1874 static Trie_node_t* 1875 trienode(Cenv_t* env, int c) 1876 { 1877 Trie_node_t* t; 1878 1879 if (t = (Trie_node_t*)alloc(env->disc, 0, sizeof(Trie_node_t))) 1880 { 1881 memset(t, 0, sizeof(Trie_node_t)); 1882 t->c = c; 1883 } 1884 return t; 1885 } 1886 1887 static int 1888 insert(Cenv_t* env, Rex_t* f, Rex_t* g) 1889 { 1890 unsigned char* s; 1891 unsigned char* e; 1892 Trie_node_t* t; 1893 int len; 1894 unsigned char tmp[2]; 1895 1896 switch (f->type) 1897 { 1898 case REX_ONECHAR: 1899 *(s = tmp) = f->re.onechar; 1900 e = s + 1; 1901 break; 1902 case REX_STRING: 1903 s = f->re.string.base; 1904 e = s + f->re.string.size; 1905 break; 1906 default: 1907 return 1; 1908 } 1909 if (!(t = g->re.trie.root[*s]) && !(t = g->re.trie.root[*s] = trienode(env, *s))) 1910 return 1; 1911 for (len = 1;;) 1912 { 1913 if (t->c == *s) 1914 { 1915 if (++s >= e) 1916 break; 1917 if (!t->son && !(t->son = trienode(env, *s))) 1918 return 1; 1919 t = t->son; 1920 len++; 1921 } 1922 else 1923 { 1924 if (!t->sib && !(t->sib = trienode(env, *s))) 1925 return 1; 1926 t = t->sib; 1927 } 1928 } 1929 if (g->re.trie.min > len) 1930 g->re.trie.min = len; 1931 if (g->re.trie.max < len) 1932 g->re.trie.max = len; 1933 t->end = 1; 1934 return 0; 1935 } 1936 1937 /* 1938 * trie() tries to combine nontrivial e and f into a REX_TRIE 1939 * unless 0 is returned, e and f are deleted as far as possible 1940 * also called by regcomb 1941 */ 1942 1943 static Rex_t* 1944 trie(Cenv_t* env, Rex_t* e, Rex_t* f) 1945 { 1946 Rex_t* g; 1947 1948 if (e->next || f->next || !isstring(e) || e->flags != f->flags) 1949 return 0; 1950 if (isstring(f)) 1951 { 1952 if (!(g = node(env, REX_TRIE, 0, 0, (UCHAR_MAX + 1) * sizeof(Trie_node_t*)))) 1953 return 0; 1954 g->re.trie.min = INT_MAX; 1955 if (insert(env, f, g)) 1956 goto nospace; 1957 drop(env->disc, f); 1958 } 1959 else if (f->type != REX_TRIE) 1960 return 0; 1961 else 1962 g = f; 1963 if (insert(env, e, g)) 1964 goto nospace; 1965 drop(env->disc, e); 1966 return g; 1967 nospace: 1968 if (g != f) 1969 drop(env->disc, g); 1970 return 0; 1971 } 1972 1973 static Rex_t* alt(Cenv_t*, int, int); 1974 1975 static int 1976 chr(register Cenv_t* env, int* escaped) 1977 { 1978 unsigned char* p; 1979 int c; 1980 1981 *escaped = 0; 1982 if (!(c = *env->cursor)) 1983 return -1; 1984 env->cursor++; 1985 if (c == '\\') 1986 { 1987 if (env->flags & REG_SHELL_ESCAPED) 1988 return c; 1989 if (!(c = *(env->cursor + 1)) || c == env->terminator) 1990 { 1991 if (env->flags & REG_LENIENT) 1992 return c ? c : '\\'; 1993 env->error = REG_EESCAPE; 1994 return -1; 1995 } 1996 p = env->cursor; 1997 c = chresc((char*)env->cursor - 1, (char**)&env->cursor); 1998 *escaped = env->cursor - p; 1999 } 2000 return c; 2001 } 2002 2003 /* 2004 * open the perly gates 2005 */ 2006 2007 static Rex_t* 2008 grp(Cenv_t* env, int parno) 2009 { 2010 Rex_t* e; 2011 Rex_t* f; 2012 int c; 2013 int i; 2014 int n; 2015 int x; 2016 int esc; 2017 int typ; 2018 int beg; 2019 unsigned char* p; 2020 2021 beg = env->pattern == env->cursor - env->token.len; 2022 if (!(c = env->token.lex) && (c = *env->cursor)) 2023 env->cursor++; 2024 env->token.len = 0; 2025 env->parnest++; 2026 typ = -1; 2027 switch (c) 2028 { 2029 case '-': 2030 case '+': 2031 case 'a': 2032 case 'g': 2033 case 'i': 2034 case 'l': 2035 case 'm': 2036 case 'p': 2037 case 'r': 2038 case 's': 2039 case 'x': 2040 case 'A': 2041 case 'B': 2042 case 'E': 2043 case 'F': 2044 case 'G': 2045 case 'I': 2046 case 'K': 2047 case 'L': 2048 case 'M': /* glob(3) */ 2049 case 'N': /* glob(3) */ 2050 case 'O': /* glob(3) */ 2051 case 'R': /* pcre */ 2052 case 'S': 2053 case 'U': /* pcre */ 2054 case 'X': /* pcre */ 2055 x = REX_GROUP; 2056 i = 1; 2057 env->token.push = 1; 2058 for (;;) 2059 { 2060 switch (c) 2061 { 2062 case ')': 2063 if (!(env->flags & REG_LITERAL)) 2064 { 2065 env->error = REG_BADRPT; 2066 return 0; 2067 } 2068 /*FALLTHROUGH*/ 2069 case 0: 2070 case T_CLOSE: 2071 x = 0; 2072 goto done; 2073 case ':': 2074 eat(env); 2075 if (token(env) == T_CLOSE) 2076 x = 0; 2077 goto done; 2078 case '-': 2079 i = 0; 2080 break; 2081 case '+': 2082 i = 1; 2083 break; 2084 case 'a': 2085 if (i) 2086 env->flags |= (REG_LEFT|REG_RIGHT); 2087 else 2088 env->flags &= ~(REG_LEFT|REG_RIGHT); 2089 break; 2090 case 'g': 2091 if (i) 2092 env->flags &= ~REG_MINIMAL; 2093 else 2094 env->flags |= REG_MINIMAL; 2095 break; 2096 case 'i': 2097 if (i) 2098 env->flags |= REG_ICASE; 2099 else 2100 env->flags &= ~REG_ICASE; 2101 break; 2102 case 'l': 2103 if (i) 2104 env->flags |= REG_LEFT; 2105 else 2106 env->flags &= ~REG_LEFT; 2107 break; 2108 case 'm': 2109 if (i) 2110 env->flags |= REG_NEWLINE; 2111 else 2112 env->flags &= ~REG_NEWLINE; 2113 env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; 2114 break; 2115 case 'p': 2116 if (i) 2117 env->flags &= ~REG_LENIENT; 2118 else 2119 env->flags |= REG_LENIENT; 2120 break; 2121 case 'r': 2122 if (i) 2123 env->flags |= REG_RIGHT; 2124 else 2125 env->flags &= ~REG_RIGHT; 2126 break; 2127 case 's': 2128 if (i) 2129 env->flags |= REG_SPAN; 2130 else 2131 env->flags &= ~REG_SPAN; 2132 env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; 2133 break; 2134 case 'x': 2135 if (i) 2136 env->flags |= REG_COMMENT; 2137 else 2138 env->flags &= ~REG_COMMENT; 2139 break; 2140 case 'A': 2141 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 2142 env->flags |= REG_AUGMENTED|REG_EXTENDED; 2143 typ = ARE; 2144 break; 2145 case 'B': 2146 case 'G': 2147 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 2148 typ = BRE; 2149 break; 2150 case 'E': 2151 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 2152 env->flags |= REG_EXTENDED; 2153 typ = ERE; 2154 break; 2155 case 'F': 2156 case 'L': 2157 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 2158 env->flags |= REG_LITERAL; 2159 typ = ERE; 2160 break; 2161 case 'K': 2162 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 2163 env->flags |= REG_AUGMENTED|REG_SHELL|REG_LEFT|REG_RIGHT; 2164 typ = KRE; 2165 break; 2166 case 'M': 2167 /* used by caller to disable glob(3) GLOB_BRACE */ 2168 break; 2169 case 'N': 2170 /* used by caller to disable glob(3) GLOB_NOCHECK */ 2171 break; 2172 case 'O': 2173 /* used by caller to disable glob(3) GLOB_STARSTAR */ 2174 break; 2175 case 'S': 2176 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT); 2177 env->flags |= REG_SHELL|REG_LEFT|REG_RIGHT; 2178 typ = SRE; 2179 break; 2180 case 'U': /* PCRE_UNGREEDY */ 2181 if (i) 2182 env->flags |= REG_MINIMAL; 2183 else 2184 env->flags &= ~REG_MINIMAL; 2185 break; 2186 case 'X': /* PCRE_EXTRA */ 2187 break; 2188 default: 2189 env->error = REG_BADRPT; 2190 return 0; 2191 } 2192 eat(env); 2193 c = token(env); 2194 } 2195 done: 2196 break; 2197 case ':': 2198 x = REX_GROUP; 2199 break; 2200 case '=': 2201 x = REX_GROUP_AHEAD; 2202 break; 2203 case '!': 2204 x = REX_GROUP_AHEAD_NOT; 2205 break; 2206 case '<': 2207 switch (token(env)) 2208 { 2209 case '=': 2210 x = REX_GROUP_BEHIND; 2211 break; 2212 case '!': 2213 case T_BANG: 2214 x = REX_GROUP_BEHIND_NOT; 2215 break; 2216 default: 2217 env->error = REG_BADRPT; 2218 return 0; 2219 } 2220 eat(env); 2221 break; 2222 case '>': 2223 x = REX_GROUP_CUT; 2224 break; 2225 case '%': 2226 case T_PERCENT: 2227 e = node(env, REX_NEST, 0, 0, (UCHAR_MAX + 1) * sizeof(unsigned short)); 2228 e->re.nest.primary = isalnum(*env->cursor) ? -1 : *env->cursor; 2229 n = 1; 2230 for (;;) 2231 { 2232 switch (i = chr(env, &esc)) 2233 { 2234 case -1: 2235 case 0: 2236 invalid: 2237 env->cursor -= esc + 1; 2238 env->error = REG_EPAREN; 2239 return 0; 2240 case 'D': 2241 x = REX_NEST_delimiter; 2242 /*FALLTHROUGH*/ 2243 delimiter: 2244 if ((i = chr(env, &esc)) < 0) 2245 goto invalid; 2246 if (e->re.nest.type[i] & ~x) 2247 goto invalid; 2248 e->re.nest.type[i] = x; 2249 continue; 2250 case 'E': 2251 x = REX_NEST_escape; 2252 goto delimiter; 2253 case 'L': 2254 x = REX_NEST_literal; 2255 goto quote; 2256 case 'O': 2257 switch (i = chr(env, &esc)) 2258 { 2259 case 'T': 2260 e->re.nest.type[UCHAR_MAX+1] |= REX_NEST_terminator; 2261 break; 2262 default: 2263 goto invalid; 2264 } 2265 continue; 2266 case 'Q': 2267 x = REX_NEST_quote; 2268 /*FALLTHROUGH*/ 2269 quote: 2270 if ((i = chr(env, &esc)) < 0) 2271 goto invalid; 2272 if (e->re.nest.type[i] & ~x) 2273 goto invalid; 2274 e->re.nest.type[i] = x|REX_NEST_open|REX_NEST_close|(i<<REX_NEST_SHIFT); 2275 continue; 2276 case 'S': 2277 x = REX_NEST_separator; 2278 goto delimiter; 2279 case 'T': 2280 x = REX_NEST_terminator; 2281 goto delimiter; 2282 case '|': 2283 case '&': 2284 if (!esc) 2285 goto invalid; 2286 goto nesting; 2287 case '(': 2288 if (!esc) 2289 n++; 2290 goto nesting; 2291 case ')': 2292 if (!esc && !--n) 2293 break; 2294 goto nesting; 2295 default: 2296 nesting: 2297 if (isalnum(i) || (e->re.nest.type[i] & (REX_NEST_close|REX_NEST_escape|REX_NEST_literal|REX_NEST_quote|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) 2298 goto invalid; 2299 e->re.nest.type[i] = REX_NEST_open; 2300 if ((x = chr(env, &esc)) < 0 || (e->re.nest.type[x] & (REX_NEST_close|REX_NEST_escape|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) 2301 goto invalid; 2302 if (!esc) 2303 { 2304 if (x == ')' && !--n) 2305 goto invalid; 2306 else if (x == '(') 2307 n++; 2308 } 2309 e->re.nest.type[x] |= REX_NEST_close; 2310 e->re.nest.type[i] |= x << REX_NEST_SHIFT; 2311 continue; 2312 } 2313 break; 2314 } 2315 env->parnest--; 2316 if (c == T_PERCENT) 2317 for (n = 0; n < 2; n++) 2318 { 2319 parno = ++env->parno; 2320 if (!(f = node(env, REX_GROUP, 0, 0, 0))) 2321 { 2322 drop(env->disc, e); 2323 return 0; 2324 } 2325 if (parno < elementsof(env->paren)) 2326 env->paren[parno] = f; 2327 f->re.group.back = 0; 2328 f->re.group.number = parno; 2329 f->re.group.expr.rex = e; 2330 e = f; 2331 } 2332 return e; 2333 case '(': 2334 c = 0; 2335 if (isdigit(*env->cursor)) 2336 { 2337 f = 0; 2338 do 2339 { 2340 if (c > (INT_MAX / 10)) 2341 { 2342 env->error = REG_BADRPT; 2343 return 0; 2344 } 2345 c = c * 10 + (*env->cursor++ - '0'); 2346 } while (isdigit(*env->cursor)); 2347 if (*env->cursor++ != ')') 2348 { 2349 env->error = REG_BADRPT; 2350 return 0; 2351 } 2352 if (c && env->type >= SRE) 2353 c = c * 2 - 1; 2354 if (!c || c > env->parno || !env->paren[c]) 2355 { 2356 if (!(env->flags & REG_LENIENT)) 2357 { 2358 env->error = REG_ESUBREG; 2359 return 0; 2360 } 2361 if (c) 2362 c = -1; 2363 } 2364 } 2365 else 2366 { 2367 if (env->type < SRE && *env->cursor++ != '?') 2368 { 2369 env->error = REG_BADRPT; 2370 return 0; 2371 } 2372 if (!(f = grp(env, parno + 1)) && env->error) 2373 return 0; 2374 } 2375 if (!(e = node(env, REX_GROUP_COND, 0, 0, 0))) 2376 { 2377 drop(env->disc, f); 2378 return 0; 2379 } 2380 e->re.group.size = c; 2381 e->re.group.expr.binary.left = f; 2382 if (!(e->re.group.expr.binary.right = alt(env, parno, 1))) 2383 { 2384 drop(env->disc, e); 2385 return 0; 2386 } 2387 if (token(env) != T_CLOSE) 2388 { 2389 env->error = REG_EPAREN; 2390 return 0; 2391 } 2392 eat(env); 2393 env->parnest--; 2394 return rep(env, e, parno, parno); 2395 case '{': 2396 p = env->cursor; 2397 n = 1; 2398 while (c = *env->cursor) 2399 { 2400 if (c == '\\' && *(env->cursor + 1)) 2401 env->cursor++; 2402 else if (c == '{') 2403 n++; 2404 else if (c == '}' && !--n) 2405 break; 2406 else if (c == env->delimiter || c == env->terminator) 2407 break; 2408 env->cursor++; 2409 } 2410 if (c != '}') 2411 { 2412 env->error = REG_EBRACE; 2413 return 0; 2414 } 2415 if (*++env->cursor != ')') 2416 { 2417 env->error = REG_EPAREN; 2418 return 0; 2419 } 2420 env->cursor++; 2421 env->parnest--; 2422 if (env->disc->re_version < REG_VERSION_EXEC) 2423 { 2424 env->error = REG_BADRPT; 2425 return 0; 2426 } 2427 if (!env->disc->re_execf) 2428 return 0; 2429 if (!(e = node(env, REX_EXEC, 0, 0, 0))) 2430 return 0; 2431 e->re.exec.text = (const char*)p; 2432 e->re.exec.size = env->cursor - p - 2; 2433 if (!env->disc->re_compf) 2434 e->re.exec.data = 0; 2435 else 2436 e->re.exec.data = (*env->disc->re_compf)(env->regex, e->re.exec.text, e->re.exec.size, env->disc); 2437 return e; 2438 case '0': case '1': case '2': case '3': case '4': 2439 case '5': case '6': case '7': case '8': case '9': 2440 c -= '0'; 2441 while (isdigit(*env->cursor)) 2442 { 2443 if (c > (INT_MAX / 10)) 2444 { 2445 env->error = REG_ESUBREG; 2446 return 0; 2447 } 2448 c = c * 10 + *env->cursor++ - '0'; 2449 } 2450 if (*env->cursor == ')') 2451 { 2452 env->cursor++; 2453 env->parnest--; 2454 env->token.len = 1; 2455 if (c > env->parno || !env->paren[c]) 2456 { 2457 env->error = REG_ESUBREG; 2458 return 0; 2459 } 2460 env->paren[c]->re.group.back = 1; 2461 return rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); 2462 } 2463 /*FALLTHROUGH*/ 2464 default: 2465 env->error = REG_BADRPT; 2466 return 0; 2467 } 2468 if (x && !(e = alt(env, parno, 0))) 2469 return 0; 2470 c = token(env); 2471 env->parnest--; 2472 if (c != T_CLOSE && (!(env->flags & REG_LITERAL) || c != ')')) 2473 { 2474 env->error = REG_EPAREN; 2475 return 0; 2476 } 2477 eat(env); 2478 if (typ >= 0) 2479 { 2480 if (beg) 2481 env->pattern = env->cursor; 2482 env->type = typ; 2483 } 2484 if (!x) 2485 return 0; 2486 if (!(f = node(env, x, 0, 0, 0))) 2487 { 2488 drop(env->disc, e); 2489 return 0; 2490 } 2491 f->re.group.expr.rex = e; 2492 if (x == REX_GROUP_BEHIND || x == REX_GROUP_BEHIND_NOT) 2493 { 2494 if (stats(env, e)) 2495 { 2496 drop(env->disc, f); 2497 if (!env->error) 2498 env->error = REG_ECOUNT; 2499 return 0; 2500 } 2501 f->re.group.size = env->stats.m; 2502 memset(&env->stats, 0, sizeof(env->stats)); 2503 } 2504 switch (x) 2505 { 2506 case REX_GROUP: 2507 case REX_GROUP_CUT: 2508 f = rep(env, f, parno, env->parno); 2509 break; 2510 } 2511 return f; 2512 } 2513 2514 static Rex_t* 2515 seq(Cenv_t* env) 2516 { 2517 Rex_t* e; 2518 Rex_t* f; 2519 Token_t tok; 2520 int c; 2521 int i; 2522 int n; 2523 int x; 2524 int parno; 2525 int type; 2526 regflags_t flags; 2527 unsigned char* s; 2528 unsigned char* p; 2529 unsigned char* t; 2530 unsigned char* u; 2531 unsigned char buf[256]; 2532 2533 for (;;) 2534 { 2535 s = buf; 2536 while ((c = token(env)) < T_META && s < &buf[sizeof(buf) - env->token.len]) 2537 { 2538 x = c; 2539 p = env->cursor; 2540 if (c >= 0) 2541 { 2542 n = 1; 2543 *s++ = (env->flags & REG_ICASE) ? toupper(c) : c; 2544 } 2545 else if (c == C_ESC || (env->flags & REG_ICASE)) 2546 { 2547 c = (c == C_ESC) ? env->token.lex : mbchar(p); 2548 if (env->flags & REG_ICASE) 2549 c = towupper(c); 2550 if ((&buf[sizeof(buf)] - s) < MB_CUR_MAX) 2551 break; 2552 if ((n = wctomb((char*)s, c)) < 0) 2553 *s++ = c; 2554 else if (n) 2555 s += n; 2556 else 2557 { 2558 n = 1; 2559 *s++ = 0; 2560 } 2561 } 2562 else 2563 { 2564 n = env->token.len - env->token.esc; 2565 for (t = p, u = s + n; s < u; *s++ = *t++); 2566 } 2567 eat(env); 2568 } 2569 if (c == T_BAD) 2570 return 0; 2571 if (s > buf) 2572 switch (c) 2573 { 2574 case T_STAR: 2575 case T_PLUS: 2576 case T_LEFT: 2577 case T_QUES: 2578 case T_BANG: 2579 if ((s -= n) == buf) 2580 e = 0; 2581 else 2582 { 2583 i = s - buf; 2584 if (!(e = node(env, REX_STRING, 0, 0, i))) 2585 return 0; 2586 memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, i); 2587 e->re.string.size = i; 2588 } 2589 if (x >= 0) 2590 { 2591 if (!(f = node(env, REX_ONECHAR, 1, 1, 0))) 2592 { 2593 drop(env->disc, e); 2594 return 0; 2595 } 2596 f->re.onechar = (env->flags & REG_ICASE) ? toupper(x) : x; 2597 } 2598 else 2599 { 2600 if (!(f = node(env, REX_STRING, 0, 0, n))) 2601 return 0; 2602 memcpy((char*)(f->re.string.base = (unsigned char*)f->re.data), (char*)p, n); 2603 f->re.string.size = n; 2604 } 2605 if (!(f = rep(env, f, 0, 0)) || !(f = cat(env, f, seq(env)))) 2606 { 2607 drop(env->disc, e); 2608 return 0; 2609 } 2610 if (e) 2611 f = cat(env, e, f); 2612 return f; 2613 default: 2614 c = s - buf; 2615 if (!(e = node(env, REX_STRING, 0, 0, c))) 2616 return 0; 2617 memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, c); 2618 e->re.string.size = c; 2619 return cat(env, e, seq(env)); 2620 } 2621 else if (c > T_BACK) 2622 { 2623 eat(env); 2624 c -= T_BACK; 2625 if (c > env->parno || !env->paren[c]) 2626 { 2627 env->error = REG_ESUBREG; 2628 return 0; 2629 } 2630 env->paren[c]->re.group.back = 1; 2631 e = rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); 2632 } 2633 else 2634 switch (c) 2635 { 2636 case T_AND: 2637 case T_CLOSE: 2638 case T_BAR: 2639 case T_END: 2640 return node(env, REX_NULL, 0, 0, 0); 2641 case T_DOLL: 2642 eat(env); 2643 e = rep(env, node(env, REX_END, 0, 0, 0), 0, 0); 2644 break; 2645 case T_CFLX: 2646 eat(env); 2647 if ((e = node(env, REX_BEG, 0, 0, 0)) && (env->flags & REG_EXTENDED)) 2648 e = rep(env, e, 0, 0); 2649 break; 2650 case T_OPEN: 2651 tok = env->token; 2652 eat(env); 2653 flags = env->flags; 2654 type = env->type; 2655 if (env->token.att) 2656 env->flags |= REG_MINIMAL; 2657 env->parnest++; 2658 if (env->type == KRE) 2659 ++env->parno; 2660 parno = ++env->parno; 2661 if (!(e = alt(env, parno + 1, 0))) 2662 break; 2663 if (e->type == REX_NULL && env->type == ERE && !(env->flags & REG_NULL)) 2664 { 2665 drop(env->disc, e); 2666 env->error = (*env->cursor == 0 || *env->cursor == env->delimiter || *env->cursor == env->terminator) ? REG_EPAREN : REG_ENULL; 2667 return 0; 2668 } 2669 if (token(env) != T_CLOSE) 2670 { 2671 drop(env->disc, e); 2672 env->error = REG_EPAREN; 2673 return 0; 2674 } 2675 env->parnest--; 2676 eat(env); 2677 if (!(f = node(env, REX_GROUP, 0, 0, 0))) 2678 { 2679 drop(env->disc, e); 2680 return 0; 2681 } 2682 if (parno < elementsof(env->paren)) 2683 env->paren[parno] = f; 2684 f->re.group.back = 0; 2685 f->re.group.number = parno; 2686 f->re.group.expr.rex = e; 2687 if (tok.lex) 2688 { 2689 tok.push = 1; 2690 env->token = tok; 2691 } 2692 if (!(e = rep(env, f, parno, env->parno))) 2693 return 0; 2694 if (env->type == KRE) 2695 { 2696 if (!(f = node(env, REX_GROUP, 0, 0, 0))) 2697 { 2698 drop(env->disc, e); 2699 return 0; 2700 } 2701 if (--parno < elementsof(env->paren)) 2702 env->paren[parno] = f; 2703 f->re.group.back = 0; 2704 f->re.group.number = parno; 2705 f->re.group.expr.rex = e; 2706 e = f; 2707 } 2708 env->flags = flags; 2709 env->type = type; 2710 break; 2711 case T_GROUP: 2712 p = env->cursor; 2713 eat(env); 2714 flags = env->flags; 2715 type = env->type; 2716 if (!(e = grp(env, env->parno + 1))) 2717 { 2718 if (env->error) 2719 return 0; 2720 if (env->literal == env->pattern && env->literal == p) 2721 env->literal = env->cursor; 2722 continue; 2723 } 2724 env->flags = flags; 2725 env->type = type; 2726 break; 2727 case T_BRA: 2728 eat(env); 2729 if (e = bra(env)) 2730 e = rep(env, e, 0, 0); 2731 break; 2732 case T_ALNUM: 2733 case T_ALNUM_NOT: 2734 case T_DIGIT: 2735 case T_DIGIT_NOT: 2736 case T_SPACE: 2737 case T_SPACE_NOT: 2738 eat(env); 2739 if (e = ccl(env, c)) 2740 e = rep(env, e, 0, 0); 2741 break; 2742 case T_LT: 2743 eat(env); 2744 e = rep(env, node(env, REX_WBEG, 0, 0, 0), 0, 0); 2745 break; 2746 case T_GT: 2747 eat(env); 2748 e = rep(env, node(env, REX_WEND, 0, 0, 0), 0, 0); 2749 break; 2750 case T_DOT: 2751 eat(env); 2752 e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); 2753 break; 2754 case T_DOTSTAR: 2755 eat(env); 2756 env->token.lex = T_STAR; 2757 env->token.push = 1; 2758 e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); 2759 break; 2760 case T_SLASHPLUS: 2761 eat(env); 2762 env->token.lex = T_PLUS; 2763 env->token.push = 1; 2764 if (e = node(env, REX_ONECHAR, 1, 1, 0)) 2765 { 2766 e->re.onechar = '/'; 2767 e = rep(env, e, 0, 0); 2768 } 2769 break; 2770 case T_WORD: 2771 eat(env); 2772 e = rep(env, node(env, REX_WORD, 0, 0, 0), 0, 0); 2773 break; 2774 case T_WORD_NOT: 2775 eat(env); 2776 e = rep(env, node(env, REX_WORD_NOT, 0, 0, 0), 0, 0); 2777 break; 2778 case T_BEG_STR: 2779 eat(env); 2780 e = rep(env, node(env, REX_BEG_STR, 0, 0, 0), 0, 0); 2781 break; 2782 case T_END_STR: 2783 eat(env); 2784 e = rep(env, node(env, REX_END_STR, 0, 0, 0), 0, 0); 2785 break; 2786 case T_FIN_STR: 2787 eat(env); 2788 e = rep(env, node(env, REX_FIN_STR, 0, 0, 0), 0, 0); 2789 break; 2790 default: 2791 env->error = REG_BADRPT; 2792 return 0; 2793 } 2794 if (e && *env->cursor != 0 && *env->cursor != env->delimiter && *env->cursor != env->terminator) 2795 e = cat(env, e, seq(env)); 2796 return e; 2797 } 2798 } 2799 2800 static Rex_t* 2801 con(Cenv_t* env) 2802 { 2803 Rex_t* e; 2804 Rex_t* f; 2805 Rex_t* g; 2806 2807 if (!(e = seq(env)) || !(env->flags & REG_AUGMENTED) || token(env) != T_AND) 2808 return e; 2809 eat(env); 2810 if (!(f = con(env))) 2811 { 2812 drop(env->disc, e); 2813 return 0; 2814 } 2815 if (!(g = node(env, REX_CONJ, 0, 0, 0))) 2816 { 2817 drop(env->disc, e); 2818 drop(env->disc, f); 2819 return 0; 2820 } 2821 g->re.group.expr.binary.left = e; 2822 g->re.group.expr.binary.right = f; 2823 return g; 2824 } 2825 2826 static Rex_t* 2827 alt(Cenv_t* env, int number, int cond) 2828 { 2829 Rex_t* e; 2830 Rex_t* f; 2831 Rex_t* g; 2832 2833 if (!(e = con(env))) 2834 return 0; 2835 else if (token(env) != T_BAR) 2836 { 2837 if (!cond) 2838 return e; 2839 f = 0; 2840 if (e->type == REX_NULL) 2841 goto bad; 2842 } 2843 else 2844 { 2845 eat(env); 2846 if (!(f = alt(env, number, 0))) 2847 { 2848 drop(env->disc, e); 2849 return 0; 2850 } 2851 if ((e->type == REX_NULL || f->type == REX_NULL) && !(env->flags & REG_NULL)) 2852 goto bad; 2853 if (!cond && (g = trie(env, e, f))) 2854 return g; 2855 } 2856 if (!(g = node(env, REX_ALT, 0, 0, 0))) 2857 { 2858 env->error = REG_ESPACE; 2859 goto bad; 2860 } 2861 g->re.group.number = number; 2862 g->re.group.last = env->parno; 2863 g->re.group.expr.binary.left = e; 2864 g->re.group.expr.binary.right = f; 2865 return g; 2866 bad: 2867 drop(env->disc, e); 2868 drop(env->disc, f); 2869 if (!env->error) 2870 env->error = REG_ENULL; 2871 return 0; 2872 } 2873 2874 /* 2875 * add v to REX_BM tables 2876 */ 2877 2878 static void 2879 bmstr(Cenv_t* env, register Rex_t* a, unsigned char* v, int n, Bm_mask_t b) 2880 { 2881 int c; 2882 int m; 2883 size_t z; 2884 2885 for (m = 0; m < n; m++) 2886 { 2887 if (!(z = n - m - 1)) 2888 z = HIT; 2889 c = v[m]; 2890 a->re.bm.mask[m][c] |= b; 2891 if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) 2892 a->re.bm.skip[c] = z; 2893 if (a->flags & REG_ICASE) 2894 { 2895 if (isupper(c)) 2896 c = tolower(c); 2897 else if (islower(c)) 2898 c = toupper(c); 2899 else 2900 continue; 2901 a->re.bm.mask[m][c] |= b; 2902 if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) 2903 a->re.bm.skip[c] = z; 2904 } 2905 } 2906 } 2907 2908 /* 2909 * set up BM table from trie 2910 */ 2911 2912 static int 2913 bmtrie(Cenv_t* env, Rex_t* a, unsigned char* v, Trie_node_t* x, int n, int m, Bm_mask_t b) 2914 { 2915 do 2916 { 2917 v[m] = x->c; 2918 if (m >= (n - 1)) 2919 { 2920 bmstr(env, a, v, n, b); 2921 if (!(b <<= 1)) 2922 { 2923 b = 1; 2924 a->re.bm.complete = 0; 2925 } 2926 else if (x->son) 2927 a->re.bm.complete = 0; 2928 } 2929 else if (x->son) 2930 b = bmtrie(env, a, v, x->son, n, m + 1, b); 2931 } while (x = x->sib); 2932 return b; 2933 } 2934 2935 /* 2936 * rewrite the expression tree for some special cases 2937 * 1. it is a null expression - illegal 2938 * 2. max length fixed string found -- use BM algorithm 2939 * 3. it begins with an unanchored string - use KMP algorithm 2940 * 0 returned on success 2941 */ 2942 2943 static int 2944 special(Cenv_t* env, regex_t* p) 2945 { 2946 Rex_t* a; 2947 Rex_t* e; 2948 Rex_t* t; 2949 Rex_t* x; 2950 Rex_t* y; 2951 unsigned char* s; 2952 int* f; 2953 int n; 2954 int m; 2955 int k; 2956 2957 DEBUG_INIT(); 2958 if (e = p->env->rex) 2959 { 2960 if ((x = env->stats.x) && x->re.string.size < 3) 2961 x = 0; 2962 if ((t = env->stats.y) && t->re.trie.min < 3) 2963 t = 0; 2964 if (x && t) 2965 { 2966 if (x->re.string.size >= t->re.trie.min) 2967 t = 0; 2968 else 2969 x = 0; 2970 } 2971 if (x || t) 2972 { 2973 Bm_mask_t** mask; 2974 Bm_mask_t* h; 2975 unsigned char* v; 2976 size_t* q; 2977 unsigned long l; 2978 int i; 2979 int j; 2980 2981 if (x) 2982 { 2983 y = x; 2984 n = m = x->re.string.size; 2985 l = env->stats.l; 2986 } 2987 else 2988 { 2989 y = t; 2990 n = t->re.trie.min; 2991 m = t->re.trie.max; 2992 l = env->stats.k; 2993 } 2994 if (!(q = (size_t*)alloc(env->disc, 0, (n + 1) * sizeof(size_t)))) 2995 return 1; 2996 if (!(a = node(env, REX_BM, 0, 0, n * (sizeof(Bm_mask_t*) + (UCHAR_MAX + 1) * sizeof(Bm_mask_t)) + (UCHAR_MAX + n + 2) * sizeof(size_t)))) 2997 { 2998 alloc(env->disc, q, 0); 2999 return 1; 3000 } 3001 a->flags = y->flags; 3002 a->map = y->map; 3003 a->re.bm.size = n; 3004 a->re.bm.back = (y == e || y == e->re.group.expr.rex) ? (m - n) : -1; 3005 a->re.bm.left = l - 1; 3006 a->re.bm.right = env->stats.m - l - n; 3007 a->re.bm.complete = (y != e && (e->type != REX_GROUP || y != e->re.group.expr.rex) || e->next || ((a->re.bm.left + a->re.bm.right) >= 0)) ? 0 : n; 3008 h = (Bm_mask_t*)&a->re.bm.mask[n]; 3009 a->re.bm.skip = (size_t*)(h + n * (UCHAR_MAX + 1)); 3010 a->re.bm.fail = &a->re.bm.skip[UCHAR_MAX + 1]; 3011 for (m = 0; m <= UCHAR_MAX; m++) 3012 a->re.bm.skip[m] = n; 3013 a->re.bm.skip[0] = a->re.bm.skip[env->mappednewline] = (y->next && y->next->type == REX_END) ? HIT : (n + a->re.bm.left); 3014 for (i = 1; i <= n; i++) 3015 a->re.bm.fail[i] = 2 * n - i; 3016 mask = a->re.bm.mask; 3017 for (m = 0; m < n; m++) 3018 { 3019 mask[m] = h; 3020 h += UCHAR_MAX + 1; 3021 } 3022 if (x) 3023 bmstr(env, a, x->re.string.base, n, 1); 3024 else 3025 { 3026 v = (unsigned char*)q; 3027 memset(v, 0, n); 3028 m = 1; 3029 for (i = 0; i <= UCHAR_MAX; i++) 3030 if (t->re.trie.root[i]) 3031 m = bmtrie(env, a, v, t->re.trie.root[i], n, 0, m); 3032 } 3033 mask--; 3034 memset(q, 0, n * sizeof(*q)); 3035 for (k = (j = n) + 1; j > 0; j--, k--) 3036 { 3037 DEBUG_TEST(0x0010,(sfprintf(sfstderr, "BM#0: k=%d j=%d\n", k, j)),(0)); 3038 for (q[j] = k; k <= n; k = q[k]) 3039 { 3040 for (m = 0; m <= UCHAR_MAX; m++) 3041 if (mask[k][m] == mask[j][m]) 3042 { 3043 DEBUG_TEST(0x0010,sfprintf(sfstderr, "CUT1: mask[%d][%c]=mask[%d][%c]\n", k, m, j, m), (0)); 3044 goto cut; 3045 } 3046 DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#2: fail[%d]=%d => %d\n", k, a->re.bm.fail[k], (a->re.bm.fail[k] > n - j) ? (n - j) : a->re.bm.fail[k]), (0)); 3047 if (a->re.bm.fail[k] > n - j) 3048 a->re.bm.fail[k] = n - j; 3049 } 3050 cut: ; 3051 } 3052 for (i = 1; i <= n; i++) 3053 if (a->re.bm.fail[i] > n + k - i) 3054 { 3055 DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#4: fail[%d]=%d => %d\n", i, a->re.bm.fail[i], n + k - i), (0)); 3056 a->re.bm.fail[i] = n + k - i; 3057 } 3058 #if _AST_REGEX_DEBUG 3059 if (DEBUG_TEST(0x0020,(1),(0))) 3060 { 3061 sfprintf(sfstderr, "STAT: complete=%d n=%d k=%d l=%d r=%d y=%d:%d e=%d:%d\n", a->re.bm.complete, n, k, a->re.bm.left, a->re.bm.right, y->type, y->next ? y->next->type : 0, e->type, e->next ? e->next->type : 0); 3062 for (m = 0; m < n; m++) 3063 for (i = 1; i <= UCHAR_MAX; i++) 3064 if (a->re.bm.mask[m][i]) 3065 sfprintf(sfstderr, "MASK: [%d]['%c'] = %032..2u\n", m, i, a->re.bm.mask[m][i]); 3066 for (i = ' '; i <= UCHAR_MAX; i++) 3067 if (a->re.bm.skip[i] >= HIT) 3068 sfprintf(sfstderr, "SKIP: ['%c'] = *\n", i); 3069 else if (a->re.bm.skip[i] > 0 && a->re.bm.skip[i] < n) 3070 sfprintf(sfstderr, "SKIP: ['%c'] = %3d\n", i, a->re.bm.skip[i]); 3071 for (j = 31; j >= 0; j--) 3072 { 3073 sfprintf(sfstderr, " "); 3074 next: 3075 for (m = 0; m < n; m++) 3076 { 3077 for (i = 0040; i < 0177; i++) 3078 if (a->re.bm.mask[m][i] & (1 << j)) 3079 { 3080 sfprintf(sfstderr, " %c", i); 3081 break; 3082 } 3083 if (i >= 0177) 3084 { 3085 if (j > 0) 3086 { 3087 j--; 3088 goto next; 3089 } 3090 sfprintf(sfstderr, " ?"); 3091 } 3092 } 3093 sfprintf(sfstderr, "\n"); 3094 } 3095 sfprintf(sfstderr, "FAIL: "); 3096 for (m = 1; m <= n; m++) 3097 sfprintf(sfstderr, "%3d", a->re.bm.fail[m]); 3098 sfprintf(sfstderr, "\n"); 3099 } 3100 #endif 3101 alloc(env->disc, q, 0); 3102 a->next = e; 3103 p->env->rex = a; 3104 return 0; 3105 } 3106 switch (e->type) 3107 { 3108 case REX_BEG: 3109 if (env->flags & REG_NEWLINE) 3110 return 0; 3111 break; 3112 case REX_GROUP: 3113 if (env->stats.b) 3114 return 0; 3115 e = e->re.group.expr.rex; 3116 if (e->type != REX_DOT) 3117 return 0; 3118 /*FALLTHROUGH*/ 3119 case REX_DOT: 3120 if (e->lo == 0 && e->hi == RE_DUP_INF) 3121 break; 3122 return 0; 3123 case REX_NULL: 3124 if (env->flags & REG_NULL) 3125 break; 3126 env->error = REG_ENULL; 3127 return 1; 3128 case REX_STRING: 3129 if ((env->flags & (REG_LEFT|REG_LITERAL|REG_RIGHT)) || e->map) 3130 return 0; 3131 s = e->re.string.base; 3132 n = e->re.string.size; 3133 if (!(a = node(env, REX_KMP, 0, 0, n * (sizeof(int*) + 1)))) 3134 return 1; 3135 a->flags = e->flags; 3136 a->map = e->map; 3137 f = a->re.string.fail; 3138 memcpy((char*)(a->re.string.base = (unsigned char*)&f[n]), (char*)s, n); 3139 s = a->re.string.base; 3140 a->re.string.size = n; 3141 f[0] = m = -1; 3142 for (k = 1; k < n; k++) 3143 { 3144 while (m >= 0 && s[m+1] != s[k]) 3145 m = f[m]; 3146 if (s[m+1] == s[k]) 3147 m++; 3148 f[k] = m; 3149 } 3150 a->next = e->next; 3151 p->env->rex = a; 3152 e->next = 0; 3153 drop(env->disc, e); 3154 break; 3155 default: 3156 return 0; 3157 } 3158 } 3159 p->env->once = 1; 3160 return 0; 3161 } 3162 3163 int 3164 regcomp(regex_t* p, const char* pattern, regflags_t flags) 3165 { 3166 Rex_t* e; 3167 Rex_t* f; 3168 regdisc_t* disc; 3169 unsigned char* fold; 3170 int i; 3171 Cenv_t env; 3172 3173 if (!p) 3174 return REG_BADPAT; 3175 if (flags & REG_DISCIPLINE) 3176 { 3177 flags &= ~REG_DISCIPLINE; 3178 disc = p->re_disc; 3179 } 3180 else 3181 disc = &state.disc; 3182 if (!disc->re_errorlevel) 3183 disc->re_errorlevel = 2; 3184 p->env = 0; 3185 if (!pattern) 3186 return fatal(disc, REG_BADPAT, pattern); 3187 if (!state.initialized) 3188 { 3189 state.initialized = 1; 3190 for (i = 0; i < elementsof(state.escape); i++) 3191 state.magic[state.escape[i].key] = state.escape[i].val; 3192 } 3193 if (!(fold = (unsigned char*)LCINFO(AST_LC_CTYPE)->data)) 3194 { 3195 if (!(fold = newof(0, unsigned char, UCHAR_MAX, 1))) 3196 return fatal(disc, REG_ESPACE, pattern); 3197 for (i = 0; i <= UCHAR_MAX; i++) 3198 fold[i] = toupper(i); 3199 LCINFO(AST_LC_CTYPE)->data = (void*)fold; 3200 } 3201 again: 3202 if (!(p->env = (Env_t*)alloc(disc, 0, sizeof(Env_t)))) 3203 return fatal(disc, REG_ESPACE, pattern); 3204 memset(p->env, 0, sizeof(*p->env)); 3205 memset(&env, 0, sizeof(env)); 3206 env.regex = p; 3207 env.flags = flags; 3208 env.disc = p->env->disc = disc; 3209 if (env.flags & REG_AUGMENTED) 3210 env.flags |= REG_EXTENDED; 3211 env.mappeddot = '.'; 3212 env.mappednewline = '\n'; 3213 env.mappedslash = '/'; 3214 if (disc->re_version >= REG_VERSION_MAP && disc->re_map) 3215 { 3216 env.map = disc->re_map; 3217 env.MAP = p->env->fold; 3218 for (i = 0; i <= UCHAR_MAX; i++) 3219 { 3220 env.MAP[i] = fold[env.map[i]]; 3221 if (env.map[i] == '.') 3222 env.mappeddot = i; 3223 if (env.map[i] == '\n') 3224 env.mappednewline = i; 3225 if (env.map[i] == '/') 3226 env.mappedslash = i; 3227 } 3228 } 3229 else 3230 env.MAP = fold; 3231 env.type = (env.flags & REG_AUGMENTED) ? ARE : (env.flags & REG_EXTENDED) ? ERE : BRE; 3232 env.explicit = -1; 3233 if (env.flags & REG_SHELL) 3234 { 3235 if (env.flags & REG_SHELL_PATH) 3236 env.explicit = env.mappedslash; 3237 env.flags |= REG_LENIENT|REG_NULL; 3238 env.type = env.type == BRE ? SRE : KRE; 3239 } 3240 if ((env.flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE) 3241 env.explicit = env.mappednewline; 3242 p->env->leading = (env.flags & REG_SHELL_DOT) ? env.mappeddot : -1; 3243 env.posixkludge = !(env.flags & (REG_EXTENDED|REG_SHELL)); 3244 env.regexp = !!(env.flags & REG_REGEXP); 3245 env.token.lex = 0; 3246 env.token.push = 0; 3247 if (env.flags & REG_DELIMITED) 3248 { 3249 switch (env.delimiter = *pattern++) 3250 { 3251 case 0: 3252 case '\\': 3253 case '\n': 3254 case '\r': 3255 env.error = REG_EDELIM; 3256 goto bad; 3257 } 3258 env.terminator = '\n'; 3259 } 3260 env.literal = env.pattern = env.cursor = (unsigned char*)pattern; 3261 if (!(p->env->rex = alt(&env, 1, 0))) 3262 goto bad; 3263 if (env.parnest) 3264 { 3265 env.error = REG_EPAREN; 3266 goto bad; 3267 } 3268 p->env->stats.re_flags = env.flags & (REG_EXTENDED|REG_AUGMENTED|REG_SHELL); 3269 if (env.flags & REG_LEFT) 3270 { 3271 if (p->env->rex->type != REX_BEG) 3272 { 3273 if (p->env->rex->type == REX_ALT) 3274 env.flags &= ~REG_FIRST; 3275 if (!(e = node(&env, REX_BEG, 0, 0, 0))) 3276 { 3277 regfree(p); 3278 return fatal(disc, REG_ESPACE, pattern); 3279 } 3280 e->next = p->env->rex; 3281 p->env->rex = e; 3282 p->env->once = 1; 3283 } 3284 p->env->stats.re_flags |= REG_LEFT; 3285 } 3286 for (e = p->env->rex; e->next; e = e->next); 3287 p->env->done.type = REX_DONE; 3288 p->env->done.flags = e->flags; 3289 if (env.flags & REG_RIGHT) 3290 { 3291 if (e->type != REX_END) 3292 { 3293 if (p->env->rex->type == REX_ALT) 3294 env.flags &= ~REG_FIRST; 3295 if (!(f = node(&env, REX_END, 0, 0, 0))) 3296 { 3297 regfree(p); 3298 return fatal(disc, REG_ESPACE, pattern); 3299 } 3300 f->flags = e->flags; 3301 f->map = e->map; 3302 e->next = f; 3303 } 3304 p->env->stats.re_flags |= REG_RIGHT; 3305 } 3306 if (stats(&env, p->env->rex)) 3307 { 3308 if (!env.error) 3309 env.error = REG_ECOUNT; 3310 goto bad; 3311 } 3312 if (env.stats.b) 3313 p->env->hard = p->env->separate = 1; 3314 else if (!(env.flags & REG_FIRST) && (env.stats.a || env.stats.c > 1 && env.stats.c != env.stats.s || env.stats.t && (env.stats.t > 1 || env.stats.a || env.stats.c))) 3315 p->env->hard = 1; 3316 if (p->env->hard || env.stats.c || env.stats.i) 3317 p->env->stats.re_min = p->env->stats.re_max = -1; 3318 else 3319 { 3320 if (!(p->env->stats.re_min = env.stats.m)) 3321 p->env->stats.re_min = -1; 3322 if (!(p->env->stats.re_max = env.stats.n)) 3323 p->env->stats.re_max = -1; 3324 } 3325 if (special(&env, p)) 3326 goto bad; 3327 serialize(&env, p->env->rex, 1); 3328 p->re_nsub = env.stats.p; 3329 if (env.type == KRE) 3330 p->re_nsub /= 2; 3331 if (env.flags & REG_DELIMITED) 3332 { 3333 p->re_npat = env.cursor - env.pattern + 1; 3334 if (*env.cursor == env.delimiter) 3335 p->re_npat++; 3336 else if (env.flags & REG_MUSTDELIM) 3337 { 3338 env.error = REG_EDELIM; 3339 goto bad; 3340 } 3341 else 3342 env.flags &= ~REG_DELIMITED; 3343 } 3344 p->env->explicit = env.explicit; 3345 p->env->flags = env.flags & REG_COMP; 3346 p->env->min = env.stats.m; 3347 p->env->nsub = env.stats.p + env.stats.u; 3348 p->env->refs = 1; 3349 return 0; 3350 bad: 3351 regfree(p); 3352 if (!env.error) 3353 env.error = REG_ESPACE; 3354 if (env.type >= SRE && env.error != REG_ESPACE && !(flags & REG_LITERAL)) 3355 { 3356 flags |= REG_LITERAL; 3357 pattern = (const char*)env.literal; 3358 goto again; 3359 } 3360 return fatal(disc, env.error, pattern); 3361 } 3362 3363 /* 3364 * regcomp() on sized pattern 3365 * the lazy way around adding and checking env.end 3366 */ 3367 3368 int 3369 regncomp(regex_t* p, const char* pattern, size_t size, regflags_t flags) 3370 { 3371 char* s; 3372 int r; 3373 3374 if (!(s = malloc(size + 1))) 3375 return fatal((flags & REG_DISCIPLINE) ? p->re_disc : &state.disc, REG_ESPACE, pattern); 3376 memcpy(s, pattern, size); 3377 s[size] = 0; 3378 r = regcomp(p, s, flags); 3379 free(s); 3380 return r; 3381 } 3382 3383 /* 3384 * combine two compiled regular expressions if possible, 3385 * replacing first with the combination and freeing second. 3386 * return 0 on success. 3387 * the only combinations handled are building a Trie 3388 * from String|Kmp|Trie and String|Kmp 3389 */ 3390 3391 int 3392 regcomb(regex_t* p, regex_t* q) 3393 { 3394 Rex_t* e = p->env->rex; 3395 Rex_t* f = q->env->rex; 3396 Rex_t* g; 3397 Cenv_t env; 3398 3399 if (!e || !f) 3400 return fatal(p->env->disc, REG_BADPAT, NiL); 3401 if (p->env->separate || q->env->separate) 3402 return REG_ESUBREG; 3403 memset(&env, 0, sizeof(env)); 3404 env.disc = p->env->disc; 3405 if (e->type == REX_BM) 3406 { 3407 p->env->rex = e->next; 3408 e->next = 0; 3409 drop(env.disc, e); 3410 e = p->env->rex; 3411 } 3412 if (f->type == REX_BM) 3413 { 3414 q->env->rex = f->next; 3415 f->next = 0; 3416 drop(env.disc, f); 3417 f = q->env->rex; 3418 } 3419 if (e->type == REX_BEG && f->type == REX_BEG) 3420 { 3421 p->env->flags |= REG_LEFT; 3422 p->env->rex = e->next; 3423 e->next = 0; 3424 drop(env.disc, e); 3425 e = p->env->rex; 3426 q->env->rex = f->next; 3427 f->next = 0; 3428 drop(env.disc, f); 3429 f = q->env->rex; 3430 } 3431 if (e->next && e->next->type == REX_END && f->next && f->next->type == REX_END) 3432 { 3433 p->env->flags |= REG_RIGHT; 3434 drop(env.disc, e->next); 3435 e->next = 0; 3436 drop(env.disc, f->next); 3437 f->next = 0; 3438 } 3439 if (!(g = trie(&env, f, e))) 3440 return fatal(p->env->disc, REG_BADPAT, NiL); 3441 p->env->rex = g; 3442 if (!q->env->once) 3443 p->env->once = 0; 3444 q->env->rex = 0; 3445 if (p->env->flags & REG_LEFT) 3446 { 3447 if (!(e = node(&env, REX_BEG, 0, 0, 0))) 3448 { 3449 regfree(p); 3450 return fatal(p->env->disc, REG_ESPACE, NiL); 3451 } 3452 e->next = p->env->rex; 3453 p->env->rex = e; 3454 p->env->once = 1; 3455 } 3456 if (p->env->flags & REG_RIGHT) 3457 { 3458 for (f = p->env->rex; f->next; f = f->next); 3459 if (f->type != REX_END) 3460 { 3461 if (!(e = node(&env, REX_END, 0, 0, 0))) 3462 { 3463 regfree(p); 3464 return fatal(p->env->disc, REG_ESPACE, NiL); 3465 } 3466 f->next = e; 3467 } 3468 } 3469 env.explicit = p->env->explicit; 3470 env.flags = p->env->flags; 3471 env.disc = p->env->disc; 3472 if (stats(&env, p->env->rex)) 3473 { 3474 regfree(p); 3475 return fatal(p->env->disc, env.error ? env.error : REG_ECOUNT, NiL); 3476 } 3477 if (special(&env, p)) 3478 { 3479 regfree(p); 3480 return fatal(p->env->disc, env.error ? env.error : REG_ESPACE, NiL); 3481 } 3482 p->env->min = g->re.trie.min; 3483 return 0; 3484 } 3485 3486 /* 3487 * copy a reference of p into q 3488 * p and q may then have separate regsubcomp() instantiations 3489 */ 3490 3491 int 3492 regdup(regex_t* p, regex_t* q) 3493 { 3494 if (!p || !q) 3495 return REG_BADPAT; 3496 *q = *p; 3497 p->env->refs++; 3498 q->re_sub = 0; 3499 return 0; 3500 } 3501