1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1985-2010 AT&T Intellectual Property * 5 * and is licensed under the * 6 * Common Public License, Version 1.0 * 7 * by AT&T Intellectual Property * 8 * * 9 * A copy of the License is available at * 10 * http://www.opensource.org/licenses/cpl1.0.txt * 11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * Phong Vo <kpv@research.att.com> * 20 * * 21 ***********************************************************************/ 22 #pragma prototyped 23 24 /* 25 * posix regex compiler 26 */ 27 28 #include "reglib.h" 29 30 #if _PACKAGE_ast 31 #include "lclib.h" 32 #endif 33 34 #define serialize re_serialize /* hp.ia64 <unistd.h>! */ 35 36 #define C_ESC (-1) 37 #define C_MB (-2) 38 39 #if _AST_REGEX_DEBUG 40 41 #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n)) 42 #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0) 43 #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_comp_debug")) debug |= strtoul(t, NiL, 0); } } while (0) 44 45 static unsigned long debug; 46 static unsigned long debug_flag; 47 48 #else 49 50 #define DEBUG_INIT() 51 #define DEBUG_TEST(f,y,n) (n) 52 #define DEBUG_CODE(f,y,n) do {n} while(0) 53 54 #endif 55 56 #if _PACKAGE_ast 57 58 typedef struct Cchr_s 59 { 60 Dtlink_t lnk; 61 unsigned char nam[2]; 62 Ckey_t key; 63 } Cchr_t; 64 65 #endif 66 67 #define eat(p) do{if ((p)->token.push)(p)->token.push=0;else (p)->cursor+=(p)->token.len;}while (0) 68 69 /* 70 * determine whether greedy matching will work, i.e. produce 71 * the best match first. such expressions are "easy", and 72 * need no backtracking once a complete match is found. 73 * if an expression has backreferences or alts it's hard 74 * else if it has only one closure it's easy 75 * else if all closures are simple (i.e. one-character) it's easy 76 * else it's hard. 77 */ 78 79 typedef struct Stats_s 80 { 81 unsigned long l; /* min length to left of x */ 82 unsigned long k; /* min length to left of y */ 83 unsigned long m; /* min length */ 84 unsigned long n; /* max length */ 85 unsigned short a; /* number of alternations */ 86 unsigned short b; /* number of backrefs */ 87 unsigned short c; /* number of closures */ 88 unsigned short e; /* $ */ 89 unsigned short i; /* number of negations */ 90 unsigned short p; /* number of named subexpressions */ 91 unsigned short s; /* number of simple closures */ 92 unsigned short t; /* number of tries */ 93 unsigned short u; /* number of unnamed subexpressions */ 94 Rex_t* x; /* max length REX_STRING */ 95 Rex_t* y; /* max length REX_TRIE */ 96 } Stats_t; 97 98 typedef struct Token_s 99 { 100 unsigned long min; 101 unsigned long max; 102 short lex; 103 short len; 104 short esc; 105 short att; 106 short push; 107 } Token_t; 108 109 typedef struct Cenv_s 110 { 111 int delimiter; /* pattern delimiter */ 112 int error; /* last error */ 113 int explicit; /* explicit match on this char */ 114 int mappeddot; /* inverse mapped '.' */ 115 int mappednewline; /* inverse mapped '\n' */ 116 int mappedslash; /* inverse mapped '/' */ 117 regflags_t flags; /* flags arg to regcomp */ 118 int type; /* BRE,ERE,ARE,SRE,KRE */ 119 unsigned char* cursor; /* curent point in re */ 120 unsigned char* pattern; /* the original pattern */ 121 unsigned char* literal; /* literal restart pattern */ 122 int parno; /* number of last open paren */ 123 int parnest; /* paren nest count */ 124 int posixkludge; /* to make * nonspecial */ 125 int regexp; /* <regexp.h> compatibility */ 126 Token_t token; /* token lookahead */ 127 Stats_t stats; /* RE statistics */ 128 int terminator; /* pattern terminator */ 129 Rex_t* paren[2*(BACK_REF_MAX+2)]; 130 /* paren[i]!=0 if \i defined */ 131 regex_t* regex; /* user handle */ 132 regdisc_t* disc; /* user discipline */ 133 unsigned char* map; /* external to native ccode map */ 134 unsigned char* MAP; /* fold and/or map */ 135 } Cenv_t; 136 137 /* 138 * allocate a new Rex_t node 139 */ 140 141 static Rex_t* 142 node(Cenv_t* env, int type, int lo, int hi, size_t extra) 143 { 144 register Rex_t* e; 145 146 if (e = (Rex_t*)alloc(env->disc, 0, sizeof(Rex_t) + extra)) 147 { 148 memset(e, 0, sizeof(Rex_t) + extra); 149 e->type = type; 150 e->marked = 0; 151 e->lo = lo; 152 e->hi = hi; 153 e->flags = env->flags; 154 e->map = (e->flags & REG_ICASE) ? env->MAP : env->map; 155 e->explicit = env->explicit; 156 if (extra) 157 e->re.data = (char*)e + sizeof(Rex_t); 158 } 159 return e; 160 } 161 162 /* 163 * free a Trie_node_t node 164 */ 165 166 static void 167 triedrop(regdisc_t* disc, Trie_node_t* e) 168 { 169 if (e) 170 { 171 triedrop(disc, e->son); 172 triedrop(disc, e->sib); 173 alloc(disc, e, 0); 174 } 175 } 176 177 /* 178 * free a Rex_t node 179 */ 180 181 void 182 drop(regdisc_t* disc, Rex_t* e) 183 { 184 int i; 185 Rex_t* x; 186 187 if (e && !(disc->re_flags & REG_NOFREE)) 188 do 189 { 190 switch (e->type) 191 { 192 case REX_ALT: 193 case REX_CONJ: 194 drop(disc, e->re.group.expr.binary.left); 195 drop(disc, e->re.group.expr.binary.right); 196 break; 197 case REX_GROUP: 198 case REX_GROUP_AHEAD: 199 case REX_GROUP_AHEAD_NOT: 200 case REX_GROUP_BEHIND: 201 case REX_GROUP_BEHIND_NOT: 202 case REX_GROUP_CUT: 203 case REX_NEG: 204 case REX_REP: 205 drop(disc, e->re.group.expr.rex); 206 break; 207 case REX_TRIE: 208 for (i = 0; i <= UCHAR_MAX; i++) 209 triedrop(disc, e->re.trie.root[i]); 210 break; 211 } 212 x = e->next; 213 alloc(disc, e, 0); 214 } while (e = x); 215 } 216 217 /* 218 * mark e and descendants minimal 219 */ 220 221 static void 222 mark(register Rex_t* e, int set) 223 { 224 if (e && !e->marked) 225 do 226 { 227 e->marked = 1; 228 if (set) 229 e->flags |= REG_MINIMAL; 230 else 231 e->flags &= ~REG_MINIMAL; 232 switch (e->type) 233 { 234 case REX_ALT: 235 case REX_CONJ: 236 case REX_GROUP_COND: 237 if (e->re.group.expr.binary.left) 238 mark(e->re.group.expr.binary.left, set); 239 if (e->re.group.expr.binary.right) 240 mark(e->re.group.expr.binary.right, set); 241 break; 242 case REX_GROUP: 243 case REX_GROUP_AHEAD: 244 case REX_GROUP_AHEAD_NOT: 245 case REX_GROUP_BEHIND: 246 case REX_GROUP_BEHIND_NOT: 247 case REX_GROUP_CUT: 248 case REX_NEG: 249 case REX_REP: 250 case REX_TRIE: 251 mark(e->re.group.expr.rex, set); 252 break; 253 } 254 } while (e = e->next); 255 } 256 257 /* 258 * assign subexpression numbers by a preorder tree walk 259 */ 260 261 static int 262 serialize(Cenv_t* env, Rex_t* e, int n) 263 { 264 do 265 { 266 e->serial = n++; 267 switch (e->type) 268 { 269 case REX_ALT: 270 case REX_GROUP_COND: 271 if (e->re.group.expr.binary.left) 272 n = serialize(env, e->re.group.expr.binary.left, n); 273 e->re.group.expr.binary.serial = n++; 274 if (e->re.group.expr.binary.right) 275 n = serialize(env, e->re.group.expr.binary.right, n); 276 break; 277 case REX_CONJ: 278 n = serialize(env, e->re.group.expr.binary.left, n); 279 n = serialize(env, e->re.group.expr.binary.right, n); 280 break; 281 case REX_GROUP: 282 case REX_GROUP_AHEAD: 283 case REX_GROUP_AHEAD_NOT: 284 case REX_GROUP_BEHIND: 285 case REX_GROUP_BEHIND_NOT: 286 case REX_GROUP_CUT: 287 case REX_NEG: 288 case REX_REP: 289 n = serialize(env, e->re.group.expr.rex, n); 290 break; 291 } 292 } while (e = e->next); 293 return n; 294 } 295 296 /* 297 * catenate e and f into a sequence, collapsing them if possible 298 */ 299 300 static Rex_t* 301 cat(Cenv_t* env, Rex_t* e, Rex_t* f) 302 { 303 Rex_t* g; 304 305 if (!f) 306 { 307 drop(env->disc, e); 308 return 0; 309 } 310 if (e->type == REX_NULL) 311 { 312 drop(env->disc, e); 313 return f; 314 } 315 if (f->type == REX_NULL) 316 { 317 g = f->next; 318 f->next = 0; 319 drop(env->disc, f); 320 f = g; 321 } 322 else if (e->type == REX_DOT && f->type == REX_DOT) 323 { 324 unsigned int m = e->lo + f->lo; 325 unsigned int n = e->hi + f->hi; 326 327 if (m <= RE_DUP_MAX) 328 { 329 if (e->hi > RE_DUP_MAX || f->hi > RE_DUP_MAX) 330 { 331 n = RE_DUP_INF; 332 goto combine; 333 } 334 else if (n <= RE_DUP_MAX) 335 { 336 combine: 337 e->lo = m; 338 e->hi = n; 339 g = f->next; 340 f->next = 0; 341 drop(env->disc, f); 342 f = g; 343 } 344 } 345 } 346 e->next = f; 347 return e; 348 } 349 350 /* 351 * collect re statistics 352 */ 353 354 static int 355 stats(register Cenv_t* env, register Rex_t* e) 356 { 357 register unsigned long n; 358 register unsigned long m; 359 unsigned long cm; 360 unsigned long nm; 361 unsigned long cn; 362 unsigned long nn; 363 unsigned long l; 364 unsigned long k; 365 unsigned long t; 366 Rex_t* q; 367 Rex_t* x; 368 Rex_t* y; 369 unsigned char c; 370 unsigned char b; 371 372 do 373 { 374 switch (e->type) 375 { 376 case REX_ALT: 377 x = env->stats.x; 378 l = env->stats.l; 379 y = env->stats.y; 380 k = env->stats.k; 381 t = env->stats.t; 382 if (++env->stats.a <= 0) 383 return 1; 384 cm = env->stats.m; 385 env->stats.m = 0; 386 cn = env->stats.n; 387 env->stats.n = 0; 388 if (stats(env, e->re.group.expr.binary.left)) 389 return 1; 390 m = env->stats.m; 391 env->stats.m = 0; 392 n = env->stats.n; 393 env->stats.n = 0; 394 if (e->re.group.expr.binary.right && stats(env, e->re.group.expr.binary.right)) 395 return 1; 396 if (env->stats.m > m) 397 env->stats.m = m; 398 else 399 m = env->stats.m; 400 if ((env->stats.m += cm) < m) 401 return 1; 402 if (env->stats.n < n) 403 env->stats.n = n; 404 else 405 n = env->stats.n; 406 if ((env->stats.n += cn) < n) 407 return 1; 408 env->stats.x = x; 409 env->stats.l = l; 410 env->stats.y = y; 411 env->stats.k = k; 412 env->stats.t = t; 413 break; 414 case REX_BACK: 415 if (++env->stats.b <= 0) 416 return 1; 417 break; 418 case REX_CLASS: 419 case REX_COLL_CLASS: 420 case REX_DOT: 421 case REX_ONECHAR: 422 n = env->stats.m; 423 if ((env->stats.m += e->lo) < n) 424 return 1; 425 if (e->hi != RE_DUP_INF) 426 { 427 n = env->stats.n; 428 if ((env->stats.n += e->hi) < n) 429 return 1; 430 } 431 if (e->lo != e->hi) 432 { 433 if (++env->stats.c <= 0) 434 return 1; 435 if (++env->stats.s <= 0) 436 return 1; 437 } 438 break; 439 case REX_CONJ: 440 cm = env->stats.m; 441 env->stats.m = 0; 442 cn = env->stats.n; 443 env->stats.n = 0; 444 if (stats(env, e->re.group.expr.binary.left)) 445 return 1; 446 nm = env->stats.m; 447 env->stats.m = 0; 448 nn = env->stats.n; 449 env->stats.n = 0; 450 if (stats(env, e->re.group.expr.binary.right)) 451 return 1; 452 if (env->stats.m < nm) 453 env->stats.m = nm; 454 else 455 nm = env->stats.m; 456 if ((env->stats.m += cm) < nm) 457 return 1; 458 if (env->stats.n < nn) 459 env->stats.n = nn; 460 else 461 nn = env->stats.n; 462 if ((env->stats.n += cn) < nn) 463 return 1; 464 break; 465 case REX_END: 466 env->stats.e = 1; 467 break; 468 case REX_GROUP: 469 if (e->re.group.number && ++env->stats.p <= 0 || !e->re.group.number && ++env->stats.u <= 0) 470 return 1; 471 if (stats(env, e->re.group.expr.rex)) 472 return 1; 473 break; 474 case REX_GROUP_AHEAD: 475 case REX_GROUP_AHEAD_NOT: 476 case REX_GROUP_BEHIND: 477 case REX_GROUP_BEHIND_NOT: 478 m = env->stats.m; 479 n = env->stats.n; 480 x = env->stats.x; 481 y = env->stats.y; 482 if (stats(env, e->re.group.expr.rex)) 483 return 1; 484 env->stats.m = m; 485 env->stats.n = n; 486 env->stats.x = x; 487 env->stats.y = y; 488 switch (e->type) 489 { 490 case REX_GROUP_AHEAD: 491 case REX_GROUP_BEHIND: 492 if (++env->stats.u <= 0) 493 return 1; 494 break; 495 } 496 break; 497 case REX_GROUP_COND: 498 if (++env->stats.u <= 0) 499 return 1; 500 m = env->stats.m; 501 n = env->stats.n; 502 x = env->stats.x; 503 y = env->stats.y; 504 if (e->re.group.size > 0 && ++env->stats.b <= 0) 505 return 1; 506 if (e->re.group.expr.binary.left && stats(env, e->re.group.expr.binary.left)) 507 return 1; 508 if (q = e->re.group.expr.binary.right) 509 { 510 if (q->re.group.expr.binary.left && stats(env, q->re.group.expr.binary.left)) 511 return 1; 512 if (q->re.group.expr.binary.right && stats(env, q->re.group.expr.binary.right)) 513 return 1; 514 } 515 env->stats.m = m; 516 env->stats.n = n; 517 env->stats.x = x; 518 env->stats.y = y; 519 break; 520 case REX_GROUP_CUT: 521 if (++env->stats.u <= 0) 522 return 1; 523 m = env->stats.m; 524 n = env->stats.n; 525 x = env->stats.x; 526 y = env->stats.y; 527 if (stats(env, e->re.group.expr.rex)) 528 return 1; 529 env->stats.m = m; 530 env->stats.n = n; 531 env->stats.x = x; 532 env->stats.y = y; 533 break; 534 case REX_NEG: 535 env->stats.i++; 536 x = env->stats.x; 537 l = env->stats.l; 538 y = env->stats.y; 539 k = env->stats.k; 540 t = env->stats.t; 541 cm = env->stats.m; 542 env->stats.m = 0; 543 if (stats(env, e->re.group.expr.rex)) 544 return 1; 545 env->stats.m = !env->stats.m; 546 if ((env->stats.m += cm) < cm) 547 return 1; 548 env->stats.x = x; 549 env->stats.l = l; 550 env->stats.y = y; 551 env->stats.k = k; 552 env->stats.t = t; 553 break; 554 case REX_REP: 555 x = env->stats.x; 556 l = env->stats.l; 557 y = env->stats.y; 558 k = env->stats.k; 559 t = env->stats.t; 560 if (++env->stats.c <= 0) 561 return 1; 562 b = env->stats.b; 563 c = env->stats.c; 564 cm = env->stats.m; 565 env->stats.m = 0; 566 if (stats(env, e->re.group.expr.rex)) 567 return 1; 568 if (env->stats.m == 1 && b == env->stats.b && c == env->stats.c && ++env->stats.s <= 0) 569 return 1; 570 if (e->lo < 1) 571 { 572 env->stats.x = x; 573 env->stats.l = l; 574 env->stats.y = y; 575 env->stats.k = k; 576 env->stats.t = t; 577 env->stats.m = cm; 578 } 579 else 580 { 581 m = env->stats.m; 582 if ((env->stats.m *= e->lo) > 0 && env->stats.m < m) 583 return 1; 584 m = env->stats.m; 585 if ((env->stats.m += cm) < m) 586 return 1; 587 if (env->stats.x != x) 588 env->stats.l = cm; 589 if (env->stats.y != y) 590 env->stats.k = cm; 591 } 592 break; 593 case REX_STRING: 594 if (!e->map) 595 { 596 cm = env->stats.m; 597 if ((env->stats.m += e->re.string.size) < cm) 598 return 1; 599 cn = env->stats.n; 600 if ((env->stats.n += e->re.string.size) < cn) 601 return 1; 602 if (!env->stats.x || env->stats.x->re.string.size < e->re.string.size) 603 { 604 env->stats.x = e; 605 env->stats.l = cm; 606 } 607 } 608 break; 609 case REX_TRIE: 610 if (++env->stats.s <= 0) 611 return 1; 612 cm = env->stats.m; 613 if ((env->stats.m += e->re.trie.min) < cm) 614 return 1; 615 cn = env->stats.n; 616 if ((env->stats.n += e->re.trie.max) < cn) 617 return 1; 618 env->stats.t++; 619 if (!env->stats.y || env->stats.y->re.trie.min < e->re.trie.min) 620 { 621 env->stats.y = e; 622 env->stats.k = cm; 623 } 624 break; 625 } 626 } while (e = e->next); 627 return 0; 628 } 629 630 static int token(Cenv_t*); 631 632 static int 633 magic(register Cenv_t* env, register int c, int escaped) 634 { 635 register char* sp; 636 register int n; 637 int o = c; 638 int e = env->error; 639 int l = env->token.len; 640 short* mp; 641 char* ep; 642 643 if (mp = state.magic[c]) 644 { 645 c = mp[env->type+escaped]; 646 if (c >= T_META) 647 { 648 sp = (char*)env->cursor + env->token.len; 649 switch (c) 650 { 651 case T_LEFT: 652 n = 0; 653 ep = sp; 654 while (*sp >= '0' && *sp <= '9') 655 { 656 if (n > (INT_MAX / 10)) 657 { 658 env->error = REG_BADBR; 659 goto bad; 660 } 661 n = n * 10 + *sp++ - '0'; 662 } 663 if (sp == ep) 664 { 665 if (env->type < SRE || *sp != ',') 666 { 667 env->error = *sp ? REG_BADBR : REG_EBRACE; 668 goto bad; 669 } 670 } 671 else if (n > RE_DUP_MAX) 672 { 673 env->error = REG_BADBR; 674 goto bad; 675 } 676 env->token.min = n; 677 if (*sp == ',') 678 { 679 n = 0; 680 ep = ++sp; 681 while (*sp >= '0' && *sp <= '9') 682 { 683 if (n > (INT_MAX / 10)) 684 { 685 env->error = REG_BADBR; 686 goto bad; 687 } 688 n = n * 10 + *sp++ - '0'; 689 } 690 if (sp == ep) 691 n = RE_DUP_INF; 692 else if (n < env->token.min) 693 { 694 env->error = REG_BADBR; 695 goto bad; 696 } 697 } 698 env->token.max = n; 699 switch (*sp) 700 { 701 case 0: 702 env->error = REG_EBRACE; 703 goto bad; 704 case '\\': 705 if (!escaped) 706 { 707 env->error = REG_BADBR; 708 goto bad; 709 } 710 sp++; 711 break; 712 default: 713 if (escaped) 714 { 715 env->error = REG_BADBR; 716 goto bad; 717 } 718 break; 719 } 720 switch (*sp++) 721 { 722 case 0: 723 env->error = REG_EBRACE; 724 goto bad; 725 case '}': 726 break; 727 default: 728 env->error = REG_BADBR; 729 goto bad; 730 } 731 env->token.len = sp - (char*)env->cursor; 732 break; 733 case T_RIGHT: 734 env->error = REG_EBRACE; 735 goto bad; 736 case T_OPEN: 737 if (env->type < SRE && *sp == '?') 738 { 739 env->token.len++; 740 env->token.lex = 0; 741 goto group; 742 } 743 break; 744 case T_ESCAPE: 745 c = chresc(sp - 2, &ep); 746 if (ep < sp) 747 goto bad; 748 env->token.len += ep - sp; 749 if (c >= T_META) 750 { 751 env->token.lex = c; 752 c = C_ESC; 753 } 754 return c; 755 case T_BACK+0: 756 case T_BACK+1: 757 case T_BACK+2: 758 case T_BACK+3: 759 case T_BACK+4: 760 case T_BACK+5: 761 case T_BACK+6: 762 case T_BACK+7: 763 n = chresc(sp - 2, &ep); 764 if (ep > sp + 1) 765 { 766 env->token.len += ep - sp; 767 return n; 768 } 769 /*FALLTHROUGH*/ 770 case T_BACK+8: 771 case T_BACK+9: 772 if (env->type == SRE || c == T_BACK && !(env->flags & REG_LENIENT)) 773 { 774 env->error = REG_BADESC; 775 goto bad; 776 } 777 if ((env->flags & REG_MULTIREF) && isdigit(*sp)) 778 { 779 c = (c - T_BACK) * 10 + (*sp - '0'); 780 if (c > 0 && c <= env->parno && env->paren[c]) 781 c += T_BACK; 782 else 783 c = chresc(sp - 2, &ep); 784 env->token.len++; 785 } 786 if (c == T_BACK) 787 c = 0; 788 break; 789 case T_BAD: 790 if (escaped == 1 && (env->flags & REG_LENIENT) && (c = mp[env->type+escaped+2]) >= T_META) 791 return c; 792 goto bad; 793 } 794 if (env->type >= SRE) 795 { 796 if (c == T_DOT) 797 c = '.'; 798 else if (c < T_OPEN) 799 { 800 if (env->type == KRE && *(env->cursor + env->token.len) == '-' && *(env->cursor + env->token.len + 1) == '(') 801 { 802 env->token.len++; 803 env->token.att = 1; 804 } 805 if (env->type == KRE && *(env->cursor + env->token.len) == '(') 806 { 807 env->token.len++; 808 switch (c) 809 { 810 case T_AT: 811 break; 812 case T_PERCENT: 813 env->token.lex = c; 814 goto group; 815 case T_TILDE: 816 env->token.lex = 0; 817 goto group; 818 default: 819 env->token.lex = c; 820 break; 821 } 822 c = T_OPEN; 823 } 824 else if (c == T_STAR) 825 c = T_DOTSTAR; 826 else if (c == T_QUES) 827 c = T_DOT; 828 else 829 { 830 c = o; 831 env->token.len = l; 832 } 833 } 834 else if (c > T_BACK) 835 { 836 c = (c - T_BACK) * 2 - 1; 837 c = (c > env->parno || !env->paren[c]) ? o : T_BACK + c; 838 } 839 else if (env->type == KRE && !env->parnest && (env->flags & REG_SHELL_GROUP)) 840 { 841 if (c == T_AND) 842 c = '&'; 843 else if (c == T_BAR) 844 c = '|'; 845 else if (c == T_OPEN) 846 c = '('; 847 } 848 } 849 } 850 } 851 else if (escaped == 2) 852 { 853 if (env->type >= SRE && !(env->flags & REG_SHELL_ESCAPED) || (env->flags & REG_ESCAPE) && (c == '[' || c == '-' || c == ']' || env->delimiter && c == env->delimiter)) 854 /*ok*/; 855 else 856 { 857 env->error = REG_BADESC; 858 goto bad; 859 } 860 } 861 else if (escaped && !(env->flags & REG_LENIENT) && c != ']') 862 { 863 env->error = REG_BADESC; 864 goto bad; 865 } 866 return c; 867 group: 868 sp = (char*)env->cursor + env->token.len; 869 switch (*sp++) 870 { 871 case ')': 872 break; 873 case '#': 874 for (;;) 875 { 876 switch (*sp++) 877 { 878 case 0: 879 env->error = REG_EPAREN; 880 return T_BAD; 881 case ')': 882 break; 883 default: 884 continue; 885 } 886 break; 887 } 888 break; 889 default: 890 return T_GROUP; 891 } 892 env->cursor = (unsigned char*)sp; 893 return token(env); 894 bad: 895 if (escaped == 2) 896 env->error = e; 897 else if (env->flags & REG_LENIENT) 898 return o; 899 else if (escaped == 1 && !env->error) 900 { 901 if (mp || o == ']') 902 return o; 903 env->error = REG_BADESC; 904 } 905 return T_BAD; 906 } 907 908 static int 909 token(register Cenv_t* env) 910 { 911 int c; 912 int posixkludge; 913 914 if (env->token.push) 915 return env->token.lex; 916 env->token.att = env->token.esc = 0; 917 if ((env->token.len = MBSIZE(env->cursor)) > 1) 918 return env->token.lex = C_MB; 919 env->token.lex = 0; 920 for (;;) 921 { 922 c = *env->cursor; 923 if (c == 0 || c == env->delimiter || c == env->terminator) 924 return T_END; 925 if (!(env->flags & REG_COMMENT)) 926 break; 927 if (c == '#') 928 { 929 do 930 { 931 c = *++env->cursor; 932 if (c == 0 || c == env->delimiter) 933 return T_END; 934 } while (c != '\n'); 935 } 936 else if (!isspace(c)) 937 break; 938 env->cursor++; 939 } 940 if (c == '\n' && (env->flags & REG_MULTIPLE) && !env->delimiter) 941 { 942 if (env->parnest) 943 { 944 env->error = REG_EPAREN; 945 return T_BAD; 946 } 947 env->parno = 0; 948 env->pattern = env->cursor + 1; 949 return T_BAR; 950 } 951 if (env->flags & REG_LITERAL) 952 return c; 953 if (posixkludge = env->posixkludge) 954 { 955 env->posixkludge = 0; 956 if (c == '*') 957 return c; 958 } 959 if (c == '\\') 960 { 961 if (env->flags & REG_SHELL_ESCAPED) 962 return c; 963 if (!(c = *(env->cursor + 1)) || c == env->terminator) 964 { 965 if (env->flags & REG_LENIENT) 966 { 967 if (c) 968 { 969 env->token.esc = env->token.len; 970 env->token.len += MBSIZE(env->cursor + 1); 971 return c; 972 } 973 return '\\'; 974 } 975 env->error = REG_EESCAPE; 976 return T_BAD; 977 } 978 env->token.esc = env->token.len; 979 env->token.len += MBSIZE(env->cursor + 1); 980 if (env->delimiter && c == 'n') 981 return '\n'; 982 else if (c == env->delimiter) 983 return magic(env, c, 0); 984 else if (c == '(' && env->type == BRE) 985 env->posixkludge = 1; 986 else if (c == ')' && env->type == BRE && env->parnest <= 0) 987 { 988 env->error = REG_EPAREN; 989 return T_BAD; 990 } 991 else if (isspace(c) && (env->flags & REG_COMMENT)) 992 return c; 993 return magic(env, c, 1); 994 } 995 else if (c == '$') 996 { 997 if (env->type == BRE && (*(env->cursor + 1) == 0 || *(env->cursor + 1) == env->delimiter || *(env->cursor + 1) == env->terminator || *(env->cursor + 1) == '\\' && *(env->cursor + 2) == ')') || (env->flags & REG_MULTIPLE) && *(env->cursor + 1) == '\n') 998 return T_DOLL; 999 } 1000 else if (c == '^') 1001 { 1002 if (env->type == BRE && (env->cursor == env->pattern || posixkludge == 1)) 1003 { 1004 env->posixkludge = 2; 1005 return T_CFLX; 1006 } 1007 } 1008 else if (c == ')') 1009 { 1010 if (env->type != BRE && env->parnest <= 0) 1011 return c; 1012 } 1013 else if (c == '/' && env->explicit == env->mappedslash) 1014 { 1015 while (*(env->cursor + env->token.len) == c) 1016 env->token.len++; 1017 return T_SLASHPLUS; 1018 } 1019 return magic(env, c, 0); 1020 } 1021 1022 static Celt_t* 1023 col(Celt_t* ce, int ic, unsigned char* bp, int bw, int bc, unsigned char* ep, int ew, int ec) 1024 { 1025 register char* s; 1026 register unsigned char* k; 1027 register unsigned char* e; 1028 register int c; 1029 register int cc; 1030 int bt; 1031 int et; 1032 Ckey_t key; 1033 1034 cc = 0; 1035 for (;;) 1036 { 1037 k = key; 1038 if (bw == 1) 1039 { 1040 c = bc; 1041 if (ic) 1042 { 1043 if (isupper(c)) 1044 { 1045 c = tolower(c); 1046 cc = -1; 1047 } 1048 else if (islower(c)) 1049 { 1050 c = toupper(c); 1051 cc = -1; 1052 } 1053 } 1054 *k++ = c; 1055 } 1056 else if (bw < COLL_KEY_MAX) 1057 { 1058 s = (char*)bp; 1059 if (ic) 1060 { 1061 c = mbchar(s); 1062 if (iswupper(c)) 1063 { 1064 c = towlower(c); 1065 cc = 1; 1066 } 1067 else if (iswlower(c)) 1068 { 1069 c = towupper(c); 1070 cc = 1; 1071 } 1072 } 1073 if (cc > 0) 1074 { 1075 cc = -1; 1076 k += mbconv((char*)k, c); 1077 } 1078 else 1079 for (e = k + bw; k < e; *k++ = *s++); 1080 } 1081 *k = 0; 1082 mbxfrm(ce->beg, key, COLL_KEY_MAX); 1083 if (ep) 1084 { 1085 k = key; 1086 c = mbchar(k); 1087 if (iswupper(c)) 1088 bt = COLL_range_uc; 1089 else if (iswlower(c)) 1090 bt = COLL_range_lc; 1091 else 1092 bt = COLL_range; 1093 k = key; 1094 if (ew == 1) 1095 { 1096 c = ec; 1097 if (ic) 1098 { 1099 if (isupper(c)) 1100 { 1101 c = tolower(c); 1102 cc = -1; 1103 } 1104 else if (islower(c)) 1105 { 1106 c = toupper(c); 1107 cc = -1; 1108 } 1109 } 1110 *k++ = c; 1111 } 1112 else if (ew < COLL_KEY_MAX) 1113 { 1114 s = (char*)ep; 1115 if (ic) 1116 { 1117 c = mbchar(s); 1118 if (iswupper(c)) 1119 { 1120 c = towlower(c); 1121 cc = 1; 1122 } 1123 else if (iswlower(c)) 1124 { 1125 c = towupper(c); 1126 cc = 1; 1127 } 1128 } 1129 if (cc > 0) 1130 { 1131 cc = -1; 1132 k += mbconv((char*)k, c); 1133 } 1134 else 1135 for (e = k + ew; k < e; *k++ = *s++); 1136 } 1137 *k = 0; 1138 mbxfrm(ce->end, key, COLL_KEY_MAX); 1139 k = key; 1140 c = mbchar(k); 1141 if (iswupper(c)) 1142 et = COLL_range_uc; 1143 else if (iswlower(c)) 1144 et = COLL_range_lc; 1145 else 1146 et = COLL_range; 1147 ce->typ = bt == et ? bt : COLL_range; 1148 } 1149 else 1150 ce->typ = COLL_char; 1151 ce++; 1152 if (!ic || !cc) 1153 break; 1154 ic = 0; 1155 } 1156 return ce; 1157 } 1158 1159 static Rex_t* 1160 bra(Cenv_t* env) 1161 { 1162 Rex_t* e; 1163 int c; 1164 int i; 1165 int w; 1166 int neg; 1167 int last; 1168 int inrange; 1169 int complicated; 1170 int collate; 1171 int elements; 1172 unsigned char* first; 1173 unsigned char* start; 1174 unsigned char* begin; 1175 unsigned char* s; 1176 regclass_t f; 1177 unsigned char buf[4 * (COLL_KEY_MAX + 1)]; 1178 #if _PACKAGE_ast 1179 int ic; 1180 char mbc[COLL_KEY_MAX + 1]; 1181 #endif 1182 1183 if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) 1184 return 0; 1185 collate = complicated = elements = 0; 1186 if (*env->cursor == '^' || env->type >= SRE && *env->cursor == '!') 1187 { 1188 env->cursor++; 1189 neg = 1; 1190 } 1191 else 1192 neg = 0; 1193 first = env->cursor; 1194 start = first + MBSIZE(first); 1195 if (*env->cursor == 0 || *(env->cursor + 1) == 0 || *env->cursor == env->terminator || *(env->cursor + 1) == env->terminator || (env->flags & REG_ESCAPE) && (*env->cursor == env->delimiter || *env->cursor != '\\' && *(env->cursor + 1) == env->delimiter)) 1196 goto error; 1197 begin = env->cursor + MBSIZE(env->cursor); 1198 1199 /* 1200 * inrange: 0=no, 1=possibly, 2=definitely 1201 */ 1202 1203 inrange = 0; 1204 for (;;) 1205 { 1206 if (!(c = *env->cursor) || c == env->terminator || c == env->delimiter && (env->flags & REG_ESCAPE)) 1207 goto error; 1208 env->cursor += (w = MBSIZE(env->cursor)); 1209 if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || env->type >= SRE && env->parnest || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))) 1210 { 1211 if (*env->cursor) 1212 { 1213 if (*env->cursor == 'n') 1214 { 1215 env->cursor++; 1216 c = '\n'; 1217 } 1218 else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) 1219 { 1220 env->token.len = 1; 1221 w = magic(env, *env->cursor, 2); 1222 if (env->token.len > 1 || w != T_BAD) 1223 { 1224 if (env->token.len == 1 && (f = classfun(w))) 1225 { 1226 if (inrange > 1) 1227 { 1228 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1229 goto erange; 1230 inrange = 0; 1231 } 1232 env->cursor++; 1233 for (c = 0; c <= UCHAR_MAX; c++) 1234 if ((*f)(c)) 1235 setadd(e->re.charclass, c); 1236 complicated++; 1237 elements++; 1238 continue; 1239 } 1240 if (env->token.len > 1 || w >= 0 && w < T_META) 1241 { 1242 c = w; 1243 if (c > UCHAR_MAX) 1244 { 1245 if (env->type < SRE && !(env->flags & REG_LENIENT) && !mbwide()) 1246 goto erange; 1247 c = UCHAR_MAX; 1248 } 1249 env->cursor += env->token.len; 1250 } 1251 } 1252 } 1253 } 1254 } 1255 else if (c == ']') 1256 { 1257 if (env->cursor == begin) 1258 { 1259 last = c; 1260 inrange = 1; 1261 continue; 1262 } 1263 if (inrange != 0) 1264 { 1265 setadd(e->re.charclass, last); 1266 elements++; 1267 if (inrange == 2) 1268 { 1269 setadd(e->re.charclass, '-'); 1270 elements++; 1271 } 1272 } 1273 break; 1274 } 1275 else if (c == '-') 1276 { 1277 if (!inrange && env->cursor != begin && *env->cursor != ']') 1278 { 1279 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1280 goto erange; 1281 continue; 1282 } 1283 else if (inrange == 1) 1284 { 1285 inrange = 2; 1286 complicated++; 1287 continue; 1288 } 1289 } 1290 else if (c == '[') 1291 { 1292 switch (*env->cursor) 1293 { 1294 case 0: 1295 goto error; 1296 case ':': 1297 if (env->regexp) 1298 goto normal; 1299 if (inrange == 1) 1300 { 1301 setadd(e->re.charclass, last); 1302 elements++; 1303 } 1304 if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) 1305 { 1306 if (env->cursor == start && (c = *(env->cursor + 1))) 1307 { 1308 s = start = env->cursor + 1; 1309 while (*++s && *s != ':'); 1310 if (*s == ':' && *(s + 1) == ']' && *(s + 2) == ']') 1311 { 1312 if ((i = (s - start)) == 1) 1313 { 1314 switch (c) 1315 { 1316 case '<': 1317 i = REX_WBEG; 1318 break; 1319 case '>': 1320 i = REX_WEND; 1321 break; 1322 default: 1323 i = 0; 1324 break; 1325 } 1326 if (i) 1327 { 1328 env->cursor = s + 3; 1329 drop(env->disc, e); 1330 return node(env, i, 0, 0, 0); 1331 } 1332 } 1333 } 1334 } 1335 env->error = REG_ECTYPE; 1336 goto error; 1337 } 1338 for (c = 0; c <= UCHAR_MAX; c++) 1339 if ((*f)(c)) 1340 setadd(e->re.charclass, c); 1341 inrange = 0; 1342 complicated++; 1343 elements++; 1344 continue; 1345 case '=': 1346 if (env->regexp) 1347 goto normal; 1348 if (inrange == 2) 1349 goto erange; 1350 if (inrange == 1) 1351 { 1352 setadd(e->re.charclass, last); 1353 elements++; 1354 } 1355 if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0) 1356 goto ecollate; 1357 if (c > 1) 1358 collate++; 1359 else 1360 setadd(e->re.charclass, buf[0]); 1361 c = buf[0]; 1362 inrange = 0; 1363 complicated++; 1364 elements++; 1365 continue; 1366 case '.': 1367 if (env->regexp) 1368 goto normal; 1369 if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0) 1370 goto ecollate; 1371 if (c > 1) 1372 collate++; 1373 c = buf[0]; 1374 complicated++; 1375 break; 1376 default: 1377 normal: 1378 if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) 1379 goto error; 1380 break; 1381 } 1382 } 1383 else if (w > 1) 1384 complicated++; 1385 if (inrange == 2) 1386 { 1387 if (last <= c) 1388 { 1389 for (i = last; i <= c; i++) 1390 setadd(e->re.charclass, i); 1391 inrange = env->type >= SRE || (env->flags & REG_LENIENT); 1392 elements += 2; 1393 } 1394 else if (env->type >= SRE) 1395 { 1396 setadd(e->re.charclass, last); 1397 setadd(e->re.charclass, c); 1398 elements += 2; 1399 inrange = 1; 1400 } 1401 else if (!(env->flags & REG_LENIENT)) 1402 goto erange; 1403 else 1404 inrange = 0; 1405 } 1406 else if (inrange == 1) 1407 { 1408 setadd(e->re.charclass, last); 1409 elements++; 1410 } 1411 else 1412 inrange = 1; 1413 last = c; 1414 } 1415 #if _PACKAGE_ast 1416 if (complicated && mbcoll()) 1417 { 1418 Dt_t* dt; 1419 Cchr_t* cc; 1420 Cchr_t* tc; 1421 Cchr_t* xc; 1422 Celt_t* ce; 1423 Cchr_t key; 1424 int rw; 1425 int rc; 1426 int wc; 1427 unsigned char* rp; 1428 unsigned char* pp; 1429 char* wp; 1430 char cb[2][COLL_KEY_MAX+1]; 1431 1432 static Dtdisc_t disc; 1433 1434 static const char primary[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; 1435 1436 if (!(dt = (Dt_t*)LCINFO(AST_LC_COLLATE)->data)) 1437 { 1438 disc.key = offsetof(Cchr_t, key); 1439 if ((cc = newof(0, Cchr_t, elementsof(primary), 0)) && (dt = dtopen(&disc, Dttree))) 1440 { 1441 for (i = 0; i < elementsof(primary) - 1; i++, cc++) 1442 { 1443 cc->nam[0] = primary[i]; 1444 mbxfrm(cc->key, cc->nam, COLL_KEY_MAX); 1445 dtinsert(dt, cc); 1446 } 1447 for (i = 0; i < elementsof(cc->key); i++) 1448 cc->key[i] = ~0; 1449 dtinsert(dt, cc); 1450 LCINFO(AST_LC_COLLATE)->data = (void*)dt; 1451 } 1452 else 1453 { 1454 if (cc) 1455 free(cc); 1456 drop(env->disc, e); 1457 return 0; 1458 } 1459 } 1460 if (dt) 1461 { 1462 drop(env->disc, e); 1463 if (ic = env->flags & REG_ICASE) 1464 elements *= 2; 1465 if (!(e = node(env, REX_COLL_CLASS, 1, 1, (elements + 2) * sizeof(Celt_t)))) 1466 return 0; 1467 ce = (Celt_t*)e->re.data; 1468 e->re.collate.invert = neg; 1469 e->re.collate.elements = ce; 1470 env->cursor = first; 1471 inrange = 0; 1472 for (;;) 1473 { 1474 if ((c = *env->cursor) == 0 || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter) 1475 goto error; 1476 pp = env->cursor; 1477 env->cursor += (w = MBSIZE(env->cursor)); 1478 if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || env->type >= SRE && env->parnest || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))) 1479 { 1480 if (*env->cursor) 1481 { 1482 if (*env->cursor == 'n') 1483 { 1484 pp = env->cursor++; 1485 c = '\n'; 1486 } 1487 else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) 1488 { 1489 env->token.len = 1; 1490 w = magic(env, *env->cursor, 2); 1491 if (env->token.len > 1 || w != T_BAD) 1492 { 1493 if (env->token.len == 1 && (f = classfun(w))) 1494 { 1495 if (inrange > 1) 1496 { 1497 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1498 goto erange; 1499 inrange = 0; 1500 } 1501 env->cursor++; 1502 ce->fun = f; 1503 ce->typ = COLL_call; 1504 ce++; 1505 continue; 1506 } 1507 if (env->token.len > 1 || w >= 0 && w < T_META) 1508 { 1509 c = w; 1510 w = mbconv(mbc, c); 1511 pp = (unsigned char*)mbc; 1512 env->cursor += env->token.len; 1513 } 1514 } 1515 } 1516 } 1517 } 1518 else if (c == ']') 1519 { 1520 if (env->cursor == begin) 1521 { 1522 rp = pp; 1523 rw = w; 1524 inrange = 1; 1525 continue; 1526 } 1527 if (inrange != 0) 1528 { 1529 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1530 if (inrange == 2) 1531 ce = col(ce, ic, NiL, 1, '-', NiL, 0, 0); 1532 } 1533 break; 1534 } 1535 else if (c == '-') 1536 { 1537 if (!inrange && env->cursor != begin && *env->cursor != ']') 1538 { 1539 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1540 goto erange; 1541 continue; 1542 } 1543 else if (inrange == 1) 1544 { 1545 inrange = 2; 1546 continue; 1547 } 1548 } 1549 else if (c == '[') 1550 { 1551 switch (*env->cursor) 1552 { 1553 case 0: 1554 goto error; 1555 case ':': 1556 if (env->regexp) 1557 goto complicated_normal; 1558 if (inrange == 1) 1559 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1560 if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) 1561 { 1562 if (env->cursor == start && (c = *(env->cursor + 1)) && *(env->cursor + 2) == ':' && *(env->cursor + 3) == ']' && *(env->cursor + 4) == ']') 1563 { 1564 switch (c) 1565 { 1566 case '<': 1567 i = REX_WBEG; 1568 break; 1569 case '>': 1570 i = REX_WEND; 1571 break; 1572 default: 1573 i = 0; 1574 break; 1575 } 1576 if (i) 1577 { 1578 env->cursor += 5; 1579 drop(env->disc, e); 1580 return node(env, i, 0, 0, 0); 1581 } 1582 } 1583 env->error = REG_ECTYPE; 1584 goto error; 1585 } 1586 ce->fun = f; 1587 ce->typ = COLL_call; 1588 ce++; 1589 inrange = 0; 1590 continue; 1591 case '=': 1592 if (env->regexp) 1593 goto complicated_normal; 1594 if (inrange == 2) 1595 goto erange; 1596 if (inrange == 1) 1597 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1598 pp = (unsigned char*)cb[inrange]; 1599 rp = env->cursor + 1; 1600 if ((rw = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0) 1601 goto ecollate; 1602 wp = (char*)pp; 1603 wc = mbchar(wp); 1604 c = 0; 1605 if (ic) 1606 { 1607 if (iswupper(wc)) 1608 { 1609 wc = towlower(wc); 1610 rw = mbconv((char*)pp, wc); 1611 c = 'u'; 1612 } 1613 else if (iswlower(wc)) 1614 c = 'l'; 1615 } 1616 for (;;) 1617 { 1618 mbxfrm(key.key, (char*)pp, COLL_KEY_MAX); 1619 if (!(cc = (Cchr_t*)dtsearch(dt, &key)) && !(cc = (Cchr_t*)dtprev(dt, &key))) 1620 goto ecollate; 1621 xc = (tc = (Cchr_t*)dtprev(dt, cc)) && !strcasecmp((char*)tc->nam, (char*)cc->nam) ? tc : cc; 1622 if (c == 'l' || c == 'L' && !(c = 0)) 1623 ce->typ = COLL_range_lc; 1624 else if (c == 'u' || c == 'U' && !(c = 0)) 1625 ce->typ = COLL_range_uc; 1626 else 1627 ce->typ = COLL_range; 1628 strcpy((char*)ce->beg, (char*)xc->key); 1629 if (!(cc = (Cchr_t*)dtnext(dt, cc))) 1630 goto ecollate; 1631 if (!strcasecmp((char*)xc->nam, (char*)cc->nam) && (tc = (Cchr_t*)dtnext(dt, cc))) 1632 cc = tc; 1633 strcpy((char*)ce->end, (char*)cc->key); 1634 ce->max = -1; 1635 ce++; 1636 if (!c) 1637 break; 1638 if (c == 'u') 1639 { 1640 wc = towlower(wc); 1641 c = 'L'; 1642 } 1643 else 1644 { 1645 wc = towupper(wc); 1646 c = 'U'; 1647 } 1648 rw = mbconv((char*)pp, wc); 1649 } 1650 inrange = 0; 1651 c = *pp; 1652 continue; 1653 case '.': 1654 if (env->regexp) 1655 goto complicated_normal; 1656 pp = (unsigned char*)cb[inrange]; 1657 if ((w = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0) 1658 goto ecollate; 1659 c = buf[0]; 1660 break; 1661 default: 1662 complicated_normal: 1663 if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) 1664 goto error; 1665 break; 1666 } 1667 } 1668 if (inrange == 2) 1669 { 1670 ce = col(ce, ic, rp, rw, rc, pp, w, c); 1671 if (strcmp((char*)ce->beg, (char*)ce->end) > 0) 1672 { 1673 if (env->type < SRE && !(env->flags & REG_LENIENT)) 1674 goto erange; 1675 (ce-1)->typ = COLL_char; 1676 strcpy((char*)ce->beg, (char*)(ce-1)->end); 1677 ce->typ = COLL_char; 1678 ce++; 1679 } 1680 inrange = env->type >= SRE || (env->flags & REG_LENIENT); 1681 } 1682 else if (inrange == 1) 1683 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1684 else 1685 inrange = 1; 1686 rp = pp; 1687 rw = w; 1688 rc = c; 1689 } 1690 ce->typ = COLL_end; 1691 return e; 1692 } 1693 } 1694 #endif 1695 if (collate) 1696 goto ecollate; 1697 if (env->flags & REG_ICASE) 1698 for (i = 0; i <= UCHAR_MAX; i++) 1699 if (settst(e->re.charclass, i)) 1700 { 1701 if (isupper(i)) 1702 c = tolower(i); 1703 else if (islower(i)) 1704 c = toupper(i); 1705 else 1706 continue; 1707 setadd(e->re.charclass, c); 1708 } 1709 if (neg) 1710 { 1711 for (i = 0; i < elementsof(e->re.charclass->bits); i++) 1712 e->re.charclass->bits[i] ^= ~0; 1713 if (env->explicit >= 0) 1714 setclr(e->re.charclass, env->explicit); 1715 } 1716 return e; 1717 ecollate: 1718 env->error = REG_ECOLLATE; 1719 goto error; 1720 erange: 1721 env->error = REG_ERANGE; 1722 error: 1723 drop(env->disc, e); 1724 if (!env->error) 1725 env->error = REG_EBRACK; 1726 return 0; 1727 } 1728 1729 static Rex_t* 1730 ccl(Cenv_t* env, int type) 1731 { 1732 int i; 1733 Rex_t* e; 1734 Celt_t* ce; 1735 regclass_t f; 1736 1737 if (!(f = classfun(type))) 1738 { 1739 env->error = REG_BADESC; 1740 return 0; 1741 } 1742 if (!mbcoll()) 1743 { 1744 if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) 1745 return 0; 1746 for (i = 0; i <= UCHAR_MAX; i++) 1747 if ((*f)(i)) 1748 setadd(e->re.charclass, i); 1749 if (env->explicit >= 0) 1750 setclr(e->re.charclass, env->explicit); 1751 } 1752 else 1753 { 1754 if (!(e = node(env, REX_COLL_CLASS, 1, 1, 2 * sizeof(Celt_t)))) 1755 return 0; 1756 ce = (Celt_t*)e->re.data; 1757 e->re.collate.invert = 0; 1758 e->re.collate.elements = ce; 1759 ce->fun = f; 1760 ce->typ = COLL_call; 1761 ce++; 1762 ce->typ = COLL_end; 1763 } 1764 return e; 1765 } 1766 1767 static Rex_t* 1768 rep(Cenv_t* env, Rex_t* e, int number, int last) 1769 { 1770 Rex_t* f; 1771 unsigned long m = 0; 1772 unsigned long n = RE_DUP_INF; 1773 int minimal = -1; 1774 1775 if (!e) 1776 return 0; 1777 switch (token(env)) 1778 { 1779 case T_BANG: 1780 eat(env); 1781 if (!(f = node(env, REX_NEG, m, n, 0))) 1782 { 1783 drop(env->disc, e); 1784 return 0; 1785 } 1786 f->re.group.expr.rex = e; 1787 return f; 1788 case T_QUES: 1789 eat(env); 1790 n = 1; 1791 break; 1792 case T_STAR: 1793 eat(env); 1794 break; 1795 case T_PLUS: 1796 eat(env); 1797 m = 1; 1798 break; 1799 case T_LEFT: 1800 eat(env); 1801 m = env->token.min; 1802 n = env->token.max; 1803 break; 1804 default: 1805 return e; 1806 } 1807 if (env->token.att) 1808 minimal = 1; 1809 else if (env->type < SRE) 1810 switch (token(env)) 1811 { 1812 case T_QUES: 1813 eat(env); 1814 minimal = !(env->flags & REG_MINIMAL); 1815 break; 1816 case T_STAR: /*AST*/ 1817 eat(env); 1818 minimal = !!(env->flags & REG_MINIMAL); 1819 break; 1820 } 1821 switch (e->type) 1822 { 1823 case REX_DOT: 1824 case REX_CLASS: 1825 case REX_COLL_CLASS: 1826 case REX_ONECHAR: 1827 e->lo = m; 1828 e->hi = n; 1829 if (minimal >= 0) 1830 mark(e, minimal); 1831 return e; 1832 #if HUH_2002_08_07 1833 case REX_BEG: 1834 #endif 1835 case REX_BEG_STR: 1836 case REX_END_STR: 1837 case REX_FIN_STR: 1838 case REX_WBEG: 1839 case REX_WEND: 1840 case REX_WORD: 1841 case REX_WORD_NOT: 1842 env->error = REG_BADRPT; 1843 drop(env->disc, e); 1844 return 0; 1845 } 1846 if (m == 1 && n == 1) 1847 { 1848 if (minimal >= 0) 1849 mark(e, minimal); 1850 return e; 1851 } 1852 if (!(f = node(env, REX_REP, m, n, 0))) 1853 { 1854 drop(env->disc, e); 1855 return 0; 1856 } 1857 f->re.group.expr.rex = e; 1858 f->re.group.number = number; 1859 f->re.group.last = last; 1860 if (minimal >= 0) 1861 mark(f, minimal); 1862 if (m <= n && n) 1863 { 1864 for (; e && e->type >= REX_GROUP && e->type <= REX_GROUP_CUT; e = e->re.group.expr.rex); 1865 if (e && e->type == REX_NEG) 1866 f->type = REX_GROUP; 1867 } 1868 return f; 1869 } 1870 1871 static int 1872 isstring(Rex_t* e) 1873 { 1874 switch (e->type) 1875 { 1876 case REX_ONECHAR: 1877 return e->lo == 1 && e->hi == 1; 1878 case REX_STRING: 1879 return 1; 1880 } 1881 return 0; 1882 } 1883 1884 static Trie_node_t* 1885 trienode(Cenv_t* env, int c) 1886 { 1887 Trie_node_t* t; 1888 1889 if (t = (Trie_node_t*)alloc(env->disc, 0, sizeof(Trie_node_t))) 1890 { 1891 memset(t, 0, sizeof(Trie_node_t)); 1892 t->c = c; 1893 } 1894 return t; 1895 } 1896 1897 static int 1898 insert(Cenv_t* env, Rex_t* f, Rex_t* g) 1899 { 1900 unsigned char* s; 1901 unsigned char* e; 1902 Trie_node_t* t; 1903 int len; 1904 unsigned char tmp[2]; 1905 1906 switch (f->type) 1907 { 1908 case REX_ONECHAR: 1909 *(s = tmp) = f->re.onechar; 1910 e = s + 1; 1911 break; 1912 case REX_STRING: 1913 s = f->re.string.base; 1914 e = s + f->re.string.size; 1915 break; 1916 default: 1917 return 1; 1918 } 1919 if (!(t = g->re.trie.root[*s]) && !(t = g->re.trie.root[*s] = trienode(env, *s))) 1920 return 1; 1921 for (len = 1;;) 1922 { 1923 if (t->c == *s) 1924 { 1925 if (++s >= e) 1926 break; 1927 if (!t->son && !(t->son = trienode(env, *s))) 1928 return 1; 1929 t = t->son; 1930 len++; 1931 } 1932 else 1933 { 1934 if (!t->sib && !(t->sib = trienode(env, *s))) 1935 return 1; 1936 t = t->sib; 1937 } 1938 } 1939 if (g->re.trie.min > len) 1940 g->re.trie.min = len; 1941 if (g->re.trie.max < len) 1942 g->re.trie.max = len; 1943 t->end = 1; 1944 return 0; 1945 } 1946 1947 /* 1948 * trie() tries to combine nontrivial e and f into a REX_TRIE 1949 * unless 0 is returned, e and f are deleted as far as possible 1950 * also called by regcomb 1951 */ 1952 1953 static Rex_t* 1954 trie(Cenv_t* env, Rex_t* e, Rex_t* f) 1955 { 1956 Rex_t* g; 1957 1958 if (e->next || f->next || !isstring(e) || e->flags != f->flags) 1959 return 0; 1960 if (isstring(f)) 1961 { 1962 if (!(g = node(env, REX_TRIE, 0, 0, (UCHAR_MAX + 1) * sizeof(Trie_node_t*)))) 1963 return 0; 1964 g->re.trie.min = INT_MAX; 1965 if (insert(env, f, g)) 1966 goto nospace; 1967 drop(env->disc, f); 1968 } 1969 else if (f->type != REX_TRIE) 1970 return 0; 1971 else 1972 g = f; 1973 if (insert(env, e, g)) 1974 goto nospace; 1975 drop(env->disc, e); 1976 return g; 1977 nospace: 1978 if (g != f) 1979 drop(env->disc, g); 1980 return 0; 1981 } 1982 1983 static Rex_t* alt(Cenv_t*, int, int); 1984 1985 static int 1986 chr(register Cenv_t* env, int* escaped) 1987 { 1988 unsigned char* p; 1989 int c; 1990 1991 *escaped = 0; 1992 if (!(c = *env->cursor)) 1993 return -1; 1994 env->cursor++; 1995 if (c == '\\') 1996 { 1997 if (env->flags & REG_SHELL_ESCAPED) 1998 return c; 1999 if (!(c = *(env->cursor + 1)) || c == env->terminator) 2000 { 2001 if (env->flags & REG_LENIENT) 2002 return c ? c : '\\'; 2003 env->error = REG_EESCAPE; 2004 return -1; 2005 } 2006 p = env->cursor; 2007 c = chresc((char*)env->cursor - 1, (char**)&env->cursor); 2008 *escaped = env->cursor - p; 2009 } 2010 return c; 2011 } 2012 2013 /* 2014 * open the perly gates 2015 */ 2016 2017 static Rex_t* 2018 grp(Cenv_t* env, int parno) 2019 { 2020 Rex_t* e; 2021 Rex_t* f; 2022 int c; 2023 int i; 2024 int n; 2025 int x; 2026 int esc; 2027 int typ; 2028 int beg; 2029 unsigned char* p; 2030 2031 beg = env->pattern == env->cursor - env->token.len; 2032 if (!(c = env->token.lex) && (c = *env->cursor)) 2033 env->cursor++; 2034 env->token.len = 0; 2035 env->parnest++; 2036 typ = -1; 2037 switch (c) 2038 { 2039 case '-': 2040 case '+': 2041 case 'a': 2042 case 'g': 2043 case 'i': 2044 case 'l': 2045 case 'm': 2046 case 'p': 2047 case 'r': 2048 case 's': 2049 case 'x': 2050 case 'A': 2051 case 'B': 2052 case 'E': 2053 case 'F': 2054 case 'G': 2055 case 'I': 2056 case 'K': 2057 case 'L': 2058 case 'M': /* glob(3) */ 2059 case 'N': /* glob(3) */ 2060 case 'O': /* glob(3) */ 2061 case 'P': 2062 case 'R': /* pcre */ 2063 case 'S': 2064 case 'U': /* pcre */ 2065 case 'X': /* pcre */ 2066 x = REX_GROUP; 2067 i = 1; 2068 env->token.push = 1; 2069 for (;;) 2070 { 2071 switch (c) 2072 { 2073 case ')': 2074 if (!(env->flags & REG_LITERAL)) 2075 { 2076 env->error = REG_BADRPT; 2077 return 0; 2078 } 2079 /*FALLTHROUGH*/ 2080 case 0: 2081 case T_CLOSE: 2082 x = 0; 2083 goto done; 2084 case ':': 2085 eat(env); 2086 if (token(env) == T_CLOSE) 2087 x = 0; 2088 goto done; 2089 case '-': 2090 i = 0; 2091 break; 2092 case '+': 2093 i = 1; 2094 break; 2095 case 'a': 2096 if (i) 2097 env->flags |= (REG_LEFT|REG_RIGHT); 2098 else 2099 env->flags &= ~(REG_LEFT|REG_RIGHT); 2100 break; 2101 case 'g': 2102 if (i) 2103 env->flags &= ~REG_MINIMAL; 2104 else 2105 env->flags |= REG_MINIMAL; 2106 break; 2107 case 'i': 2108 if (i) 2109 env->flags |= REG_ICASE; 2110 else 2111 env->flags &= ~REG_ICASE; 2112 break; 2113 case 'l': 2114 if (i) 2115 env->flags |= REG_LEFT; 2116 else 2117 env->flags &= ~REG_LEFT; 2118 break; 2119 case 'm': 2120 if (i) 2121 env->flags |= REG_NEWLINE; 2122 else 2123 env->flags &= ~REG_NEWLINE; 2124 env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; 2125 break; 2126 case 'p': 2127 if (i) 2128 env->flags &= ~REG_LENIENT; 2129 else 2130 env->flags |= REG_LENIENT; 2131 break; 2132 case 'r': 2133 if (i) 2134 env->flags |= REG_RIGHT; 2135 else 2136 env->flags &= ~REG_RIGHT; 2137 break; 2138 case 's': 2139 if (i) 2140 env->flags |= REG_SPAN; 2141 else 2142 env->flags &= ~REG_SPAN; 2143 env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; 2144 break; 2145 case 'x': 2146 if (i) 2147 env->flags |= REG_COMMENT; 2148 else 2149 env->flags &= ~REG_COMMENT; 2150 break; 2151 case 'A': 2152 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); 2153 env->flags |= REG_AUGMENTED|REG_EXTENDED; 2154 typ = ARE; 2155 break; 2156 case 'B': 2157 case 'G': 2158 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); 2159 typ = BRE; 2160 break; 2161 case 'E': 2162 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); 2163 env->flags |= REG_EXTENDED; 2164 typ = ERE; 2165 break; 2166 case 'F': 2167 case 'L': 2168 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); 2169 env->flags |= REG_LITERAL; 2170 typ = ERE; 2171 break; 2172 case 'K': 2173 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); 2174 env->flags |= REG_AUGMENTED|REG_SHELL|REG_LEFT|REG_RIGHT; 2175 typ = KRE; 2176 break; 2177 case 'M': 2178 /* used by caller to disable glob(3) GLOB_BRACE */ 2179 break; 2180 case 'N': 2181 /* used by caller to disable glob(3) GLOB_NOCHECK */ 2182 break; 2183 case 'O': 2184 /* used by caller to disable glob(3) GLOB_STARSTAR */ 2185 break; 2186 case 'P': 2187 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); 2188 env->flags |= REG_EXTENDED|REG_CLASS_ESCAPE; 2189 typ = ERE; 2190 break; 2191 case 'S': 2192 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE); 2193 env->flags |= REG_SHELL|REG_LEFT|REG_RIGHT; 2194 typ = SRE; 2195 break; 2196 case 'U': /* PCRE_UNGREEDY */ 2197 if (i) 2198 env->flags |= REG_MINIMAL; 2199 else 2200 env->flags &= ~REG_MINIMAL; 2201 break; 2202 case 'X': /* PCRE_EXTRA */ 2203 break; 2204 default: 2205 env->error = REG_BADRPT; 2206 return 0; 2207 } 2208 eat(env); 2209 c = token(env); 2210 } 2211 done: 2212 break; 2213 case ':': 2214 x = REX_GROUP; 2215 break; 2216 case '=': 2217 x = REX_GROUP_AHEAD; 2218 break; 2219 case '!': 2220 x = REX_GROUP_AHEAD_NOT; 2221 break; 2222 case '<': 2223 switch (token(env)) 2224 { 2225 case '=': 2226 x = REX_GROUP_BEHIND; 2227 break; 2228 case '!': 2229 case T_BANG: 2230 x = REX_GROUP_BEHIND_NOT; 2231 break; 2232 default: 2233 env->error = REG_BADRPT; 2234 return 0; 2235 } 2236 eat(env); 2237 break; 2238 case '>': 2239 x = REX_GROUP_CUT; 2240 break; 2241 case '%': 2242 case T_PERCENT: 2243 e = node(env, REX_NEST, 0, 0, (UCHAR_MAX + 1) * sizeof(unsigned short)); 2244 e->re.nest.primary = isalnum(*env->cursor) ? -1 : *env->cursor; 2245 n = 1; 2246 for (;;) 2247 { 2248 switch (i = chr(env, &esc)) 2249 { 2250 case -1: 2251 case 0: 2252 invalid: 2253 env->cursor -= esc + 1; 2254 env->error = REG_EPAREN; 2255 return 0; 2256 case 'D': 2257 x = REX_NEST_delimiter; 2258 /*FALLTHROUGH*/ 2259 delimiter: 2260 if ((i = chr(env, &esc)) < 0) 2261 goto invalid; 2262 if (e->re.nest.type[i] & ~x) 2263 goto invalid; 2264 e->re.nest.type[i] = x; 2265 continue; 2266 case 'E': 2267 x = REX_NEST_escape; 2268 goto delimiter; 2269 case 'L': 2270 x = REX_NEST_literal; 2271 goto quote; 2272 case 'O': 2273 switch (i = chr(env, &esc)) 2274 { 2275 case 'T': 2276 e->re.nest.type[UCHAR_MAX+1] |= REX_NEST_terminator; 2277 break; 2278 default: 2279 goto invalid; 2280 } 2281 continue; 2282 case 'Q': 2283 x = REX_NEST_quote; 2284 /*FALLTHROUGH*/ 2285 quote: 2286 if ((i = chr(env, &esc)) < 0) 2287 goto invalid; 2288 if (e->re.nest.type[i] & ~x) 2289 goto invalid; 2290 e->re.nest.type[i] = x|REX_NEST_open|REX_NEST_close|(i<<REX_NEST_SHIFT); 2291 continue; 2292 case 'S': 2293 x = REX_NEST_separator; 2294 goto delimiter; 2295 case 'T': 2296 x = REX_NEST_terminator; 2297 goto delimiter; 2298 case '|': 2299 case '&': 2300 if (!esc) 2301 goto invalid; 2302 goto nesting; 2303 case '(': 2304 if (!esc) 2305 n++; 2306 goto nesting; 2307 case ')': 2308 if (!esc && !--n) 2309 break; 2310 goto nesting; 2311 default: 2312 nesting: 2313 if (isalnum(i) || (e->re.nest.type[i] & (REX_NEST_close|REX_NEST_escape|REX_NEST_literal|REX_NEST_quote|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) 2314 goto invalid; 2315 e->re.nest.type[i] = REX_NEST_open; 2316 if ((x = chr(env, &esc)) < 0 || (e->re.nest.type[x] & (REX_NEST_close|REX_NEST_escape|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) 2317 goto invalid; 2318 if (!esc) 2319 { 2320 if (x == ')' && !--n) 2321 goto invalid; 2322 else if (x == '(') 2323 n++; 2324 } 2325 e->re.nest.type[x] |= REX_NEST_close; 2326 e->re.nest.type[i] |= x << REX_NEST_SHIFT; 2327 continue; 2328 } 2329 break; 2330 } 2331 env->parnest--; 2332 if (c == T_PERCENT) 2333 for (n = 0; n < 2; n++) 2334 { 2335 parno = ++env->parno; 2336 if (!(f = node(env, REX_GROUP, 0, 0, 0))) 2337 { 2338 drop(env->disc, e); 2339 return 0; 2340 } 2341 if (parno < elementsof(env->paren)) 2342 env->paren[parno] = f; 2343 f->re.group.back = 0; 2344 f->re.group.number = parno; 2345 f->re.group.expr.rex = e; 2346 e = f; 2347 } 2348 return e; 2349 case '(': 2350 c = 0; 2351 if (isdigit(*env->cursor)) 2352 { 2353 f = 0; 2354 do 2355 { 2356 if (c > (INT_MAX / 10)) 2357 { 2358 env->error = REG_BADRPT; 2359 return 0; 2360 } 2361 c = c * 10 + (*env->cursor++ - '0'); 2362 } while (isdigit(*env->cursor)); 2363 if (*env->cursor++ != ')') 2364 { 2365 env->error = REG_BADRPT; 2366 return 0; 2367 } 2368 if (c && env->type >= SRE) 2369 c = c * 2 - 1; 2370 if (!c || c > env->parno || !env->paren[c]) 2371 { 2372 if (!(env->flags & REG_LENIENT)) 2373 { 2374 env->error = REG_ESUBREG; 2375 return 0; 2376 } 2377 if (c) 2378 c = -1; 2379 } 2380 } 2381 else 2382 { 2383 if (env->type < SRE && *env->cursor++ != '?') 2384 { 2385 env->error = REG_BADRPT; 2386 return 0; 2387 } 2388 if (!(f = grp(env, parno + 1)) && env->error) 2389 return 0; 2390 } 2391 if (!(e = node(env, REX_GROUP_COND, 0, 0, 0))) 2392 { 2393 drop(env->disc, f); 2394 return 0; 2395 } 2396 e->re.group.size = c; 2397 e->re.group.expr.binary.left = f; 2398 if (!(e->re.group.expr.binary.right = alt(env, parno, 1))) 2399 { 2400 drop(env->disc, e); 2401 return 0; 2402 } 2403 if (token(env) != T_CLOSE) 2404 { 2405 env->error = REG_EPAREN; 2406 return 0; 2407 } 2408 eat(env); 2409 env->parnest--; 2410 return rep(env, e, parno, parno); 2411 case '{': 2412 p = env->cursor; 2413 n = 1; 2414 while (c = *env->cursor) 2415 { 2416 if (c == '\\' && *(env->cursor + 1)) 2417 env->cursor++; 2418 else if (c == '{') 2419 n++; 2420 else if (c == '}' && !--n) 2421 break; 2422 else if (c == env->delimiter || c == env->terminator) 2423 break; 2424 env->cursor++; 2425 } 2426 if (c != '}') 2427 { 2428 env->error = REG_EBRACE; 2429 return 0; 2430 } 2431 if (*++env->cursor != ')') 2432 { 2433 env->error = REG_EPAREN; 2434 return 0; 2435 } 2436 env->cursor++; 2437 env->parnest--; 2438 if (env->disc->re_version < REG_VERSION_EXEC) 2439 { 2440 env->error = REG_BADRPT; 2441 return 0; 2442 } 2443 if (!env->disc->re_execf) 2444 return 0; 2445 if (!(e = node(env, REX_EXEC, 0, 0, 0))) 2446 return 0; 2447 e->re.exec.text = (const char*)p; 2448 e->re.exec.size = env->cursor - p - 2; 2449 if (!env->disc->re_compf) 2450 e->re.exec.data = 0; 2451 else 2452 e->re.exec.data = (*env->disc->re_compf)(env->regex, e->re.exec.text, e->re.exec.size, env->disc); 2453 return e; 2454 case '0': case '1': case '2': case '3': case '4': 2455 case '5': case '6': case '7': case '8': case '9': 2456 c -= '0'; 2457 while (isdigit(*env->cursor)) 2458 { 2459 if (c > (INT_MAX / 10)) 2460 { 2461 env->error = REG_ESUBREG; 2462 return 0; 2463 } 2464 c = c * 10 + *env->cursor++ - '0'; 2465 } 2466 if (*env->cursor == ')') 2467 { 2468 env->cursor++; 2469 env->parnest--; 2470 env->token.len = 1; 2471 if (c > env->parno || !env->paren[c]) 2472 { 2473 env->error = REG_ESUBREG; 2474 return 0; 2475 } 2476 env->paren[c]->re.group.back = 1; 2477 return rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); 2478 } 2479 /*FALLTHROUGH*/ 2480 default: 2481 env->error = REG_BADRPT; 2482 return 0; 2483 } 2484 if (x && !(e = alt(env, parno, 0))) 2485 return 0; 2486 c = token(env); 2487 env->parnest--; 2488 if (c != T_CLOSE && (!(env->flags & REG_LITERAL) || c != ')')) 2489 { 2490 env->error = REG_EPAREN; 2491 return 0; 2492 } 2493 eat(env); 2494 if (typ >= 0) 2495 { 2496 if (beg) 2497 env->pattern = env->cursor; 2498 env->type = typ; 2499 } 2500 if (!x) 2501 return 0; 2502 if (!(f = node(env, x, 0, 0, 0))) 2503 { 2504 drop(env->disc, e); 2505 return 0; 2506 } 2507 f->re.group.expr.rex = e; 2508 if (x == REX_GROUP_BEHIND || x == REX_GROUP_BEHIND_NOT) 2509 { 2510 if (stats(env, e)) 2511 { 2512 drop(env->disc, f); 2513 if (!env->error) 2514 env->error = REG_ECOUNT; 2515 return 0; 2516 } 2517 f->re.group.size = env->stats.m; 2518 memset(&env->stats, 0, sizeof(env->stats)); 2519 } 2520 switch (x) 2521 { 2522 case REX_GROUP: 2523 case REX_GROUP_CUT: 2524 f = rep(env, f, parno, env->parno); 2525 break; 2526 } 2527 return f; 2528 } 2529 2530 static Rex_t* 2531 seq(Cenv_t* env) 2532 { 2533 Rex_t* e; 2534 Rex_t* f; 2535 Token_t tok; 2536 int c; 2537 int i; 2538 int n; 2539 int x; 2540 int parno; 2541 int type; 2542 regflags_t flags; 2543 unsigned char* s; 2544 unsigned char* p; 2545 unsigned char* t; 2546 unsigned char* u; 2547 unsigned char buf[256]; 2548 2549 for (;;) 2550 { 2551 s = buf; 2552 while ((c = token(env)) < T_META && s < &buf[sizeof(buf) - env->token.len]) 2553 { 2554 x = c; 2555 p = env->cursor; 2556 if (c >= 0) 2557 { 2558 n = 1; 2559 *s++ = (env->flags & REG_ICASE) ? toupper(c) : c; 2560 } 2561 else if (c == C_ESC || (env->flags & REG_ICASE)) 2562 { 2563 c = (c == C_ESC) ? env->token.lex : mbchar(p); 2564 if (env->flags & REG_ICASE) 2565 c = towupper(c); 2566 if ((&buf[sizeof(buf)] - s) < MB_CUR_MAX) 2567 break; 2568 if ((n = mbconv((char*)s, c)) < 0) 2569 *s++ = c; 2570 else if (n) 2571 s += n; 2572 else 2573 { 2574 n = 1; 2575 *s++ = 0; 2576 } 2577 } 2578 else 2579 { 2580 n = env->token.len - env->token.esc; 2581 for (t = p, u = s + n; s < u; *s++ = *t++); 2582 } 2583 eat(env); 2584 } 2585 if (c == T_BAD) 2586 return 0; 2587 if (s > buf) 2588 switch (c) 2589 { 2590 case T_STAR: 2591 case T_PLUS: 2592 case T_LEFT: 2593 case T_QUES: 2594 case T_BANG: 2595 if ((s -= n) == buf) 2596 e = 0; 2597 else 2598 { 2599 i = s - buf; 2600 if (!(e = node(env, REX_STRING, 0, 0, i))) 2601 return 0; 2602 memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, i); 2603 e->re.string.size = i; 2604 } 2605 if (x >= 0) 2606 { 2607 if (!(f = node(env, REX_ONECHAR, 1, 1, 0))) 2608 { 2609 drop(env->disc, e); 2610 return 0; 2611 } 2612 f->re.onechar = (env->flags & REG_ICASE) ? toupper(x) : x; 2613 } 2614 else 2615 { 2616 if (!(f = node(env, REX_STRING, 0, 0, n))) 2617 return 0; 2618 memcpy((char*)(f->re.string.base = (unsigned char*)f->re.data), (char*)p, n); 2619 f->re.string.size = n; 2620 } 2621 if (!(f = rep(env, f, 0, 0)) || !(f = cat(env, f, seq(env)))) 2622 { 2623 drop(env->disc, e); 2624 return 0; 2625 } 2626 if (e) 2627 f = cat(env, e, f); 2628 return f; 2629 default: 2630 c = s - buf; 2631 if (!(e = node(env, REX_STRING, 0, 0, c))) 2632 return 0; 2633 memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, c); 2634 e->re.string.size = c; 2635 return cat(env, e, seq(env)); 2636 } 2637 else if (c > T_BACK) 2638 { 2639 eat(env); 2640 c -= T_BACK; 2641 if (c > env->parno || !env->paren[c]) 2642 { 2643 env->error = REG_ESUBREG; 2644 return 0; 2645 } 2646 env->paren[c]->re.group.back = 1; 2647 e = rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); 2648 } 2649 else 2650 switch (c) 2651 { 2652 case T_AND: 2653 case T_CLOSE: 2654 case T_BAR: 2655 case T_END: 2656 return node(env, REX_NULL, 0, 0, 0); 2657 case T_DOLL: 2658 eat(env); 2659 e = rep(env, node(env, REX_END, 0, 0, 0), 0, 0); 2660 break; 2661 case T_CFLX: 2662 eat(env); 2663 if ((e = node(env, REX_BEG, 0, 0, 0)) && (env->flags & REG_EXTENDED)) 2664 e = rep(env, e, 0, 0); 2665 break; 2666 case T_OPEN: 2667 tok = env->token; 2668 eat(env); 2669 flags = env->flags; 2670 type = env->type; 2671 if (env->token.att) 2672 env->flags |= REG_MINIMAL; 2673 env->parnest++; 2674 if (env->type == KRE) 2675 ++env->parno; 2676 parno = ++env->parno; 2677 if (!(e = alt(env, parno + 1, 0))) 2678 break; 2679 if (e->type == REX_NULL && env->type == ERE && !(env->flags & REG_NULL)) 2680 { 2681 drop(env->disc, e); 2682 env->error = (*env->cursor == 0 || *env->cursor == env->delimiter || *env->cursor == env->terminator) ? REG_EPAREN : REG_ENULL; 2683 return 0; 2684 } 2685 if (token(env) != T_CLOSE) 2686 { 2687 drop(env->disc, e); 2688 env->error = REG_EPAREN; 2689 return 0; 2690 } 2691 env->parnest--; 2692 eat(env); 2693 if (!(f = node(env, REX_GROUP, 0, 0, 0))) 2694 { 2695 drop(env->disc, e); 2696 return 0; 2697 } 2698 if (parno < elementsof(env->paren)) 2699 env->paren[parno] = f; 2700 f->re.group.back = 0; 2701 f->re.group.number = parno; 2702 f->re.group.expr.rex = e; 2703 if (tok.lex) 2704 { 2705 tok.push = 1; 2706 env->token = tok; 2707 } 2708 if (!(e = rep(env, f, parno, env->parno))) 2709 return 0; 2710 if (env->type == KRE) 2711 { 2712 if (!(f = node(env, REX_GROUP, 0, 0, 0))) 2713 { 2714 drop(env->disc, e); 2715 return 0; 2716 } 2717 if (--parno < elementsof(env->paren)) 2718 env->paren[parno] = f; 2719 f->re.group.back = 0; 2720 f->re.group.number = parno; 2721 f->re.group.expr.rex = e; 2722 e = f; 2723 } 2724 env->flags = flags; 2725 env->type = type; 2726 break; 2727 case T_GROUP: 2728 p = env->cursor; 2729 eat(env); 2730 flags = env->flags; 2731 type = env->type; 2732 if (!(e = grp(env, env->parno + 1))) 2733 { 2734 if (env->error) 2735 return 0; 2736 if (env->literal == env->pattern && env->literal == p) 2737 env->literal = env->cursor; 2738 continue; 2739 } 2740 env->flags = flags; 2741 env->type = type; 2742 break; 2743 case T_BRA: 2744 eat(env); 2745 if (e = bra(env)) 2746 e = rep(env, e, 0, 0); 2747 break; 2748 case T_ALNUM: 2749 case T_ALNUM_NOT: 2750 case T_DIGIT: 2751 case T_DIGIT_NOT: 2752 case T_SPACE: 2753 case T_SPACE_NOT: 2754 eat(env); 2755 if (e = ccl(env, c)) 2756 e = rep(env, e, 0, 0); 2757 break; 2758 case T_LT: 2759 eat(env); 2760 e = rep(env, node(env, REX_WBEG, 0, 0, 0), 0, 0); 2761 break; 2762 case T_GT: 2763 eat(env); 2764 e = rep(env, node(env, REX_WEND, 0, 0, 0), 0, 0); 2765 break; 2766 case T_DOT: 2767 eat(env); 2768 e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); 2769 break; 2770 case T_DOTSTAR: 2771 eat(env); 2772 env->token.lex = T_STAR; 2773 env->token.push = 1; 2774 e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); 2775 break; 2776 case T_SLASHPLUS: 2777 eat(env); 2778 env->token.lex = T_PLUS; 2779 env->token.push = 1; 2780 if (e = node(env, REX_ONECHAR, 1, 1, 0)) 2781 { 2782 e->re.onechar = '/'; 2783 e = rep(env, e, 0, 0); 2784 } 2785 break; 2786 case T_WORD: 2787 eat(env); 2788 e = rep(env, node(env, REX_WORD, 0, 0, 0), 0, 0); 2789 break; 2790 case T_WORD_NOT: 2791 eat(env); 2792 e = rep(env, node(env, REX_WORD_NOT, 0, 0, 0), 0, 0); 2793 break; 2794 case T_BEG_STR: 2795 eat(env); 2796 e = rep(env, node(env, REX_BEG_STR, 0, 0, 0), 0, 0); 2797 break; 2798 case T_END_STR: 2799 eat(env); 2800 e = rep(env, node(env, REX_END_STR, 0, 0, 0), 0, 0); 2801 break; 2802 case T_FIN_STR: 2803 eat(env); 2804 e = rep(env, node(env, REX_FIN_STR, 0, 0, 0), 0, 0); 2805 break; 2806 default: 2807 env->error = REG_BADRPT; 2808 return 0; 2809 } 2810 if (e && *env->cursor != 0 && *env->cursor != env->delimiter && *env->cursor != env->terminator) 2811 e = cat(env, e, seq(env)); 2812 return e; 2813 } 2814 } 2815 2816 static Rex_t* 2817 con(Cenv_t* env) 2818 { 2819 Rex_t* e; 2820 Rex_t* f; 2821 Rex_t* g; 2822 2823 if (!(e = seq(env)) || !(env->flags & REG_AUGMENTED) || token(env) != T_AND) 2824 return e; 2825 eat(env); 2826 if (!(f = con(env))) 2827 { 2828 drop(env->disc, e); 2829 return 0; 2830 } 2831 if (!(g = node(env, REX_CONJ, 0, 0, 0))) 2832 { 2833 drop(env->disc, e); 2834 drop(env->disc, f); 2835 return 0; 2836 } 2837 g->re.group.expr.binary.left = e; 2838 g->re.group.expr.binary.right = f; 2839 return g; 2840 } 2841 2842 static Rex_t* 2843 alt(Cenv_t* env, int number, int cond) 2844 { 2845 Rex_t* e; 2846 Rex_t* f; 2847 Rex_t* g; 2848 2849 if (!(e = con(env))) 2850 return 0; 2851 else if (token(env) != T_BAR) 2852 { 2853 if (!cond) 2854 return e; 2855 f = 0; 2856 if (e->type == REX_NULL) 2857 goto bad; 2858 } 2859 else 2860 { 2861 eat(env); 2862 if (!(f = alt(env, number, 0))) 2863 { 2864 drop(env->disc, e); 2865 return 0; 2866 } 2867 if ((e->type == REX_NULL || f->type == REX_NULL) && !(env->flags & REG_NULL)) 2868 goto bad; 2869 if (!cond && (g = trie(env, e, f))) 2870 return g; 2871 } 2872 if (!(g = node(env, REX_ALT, 0, 0, 0))) 2873 { 2874 env->error = REG_ESPACE; 2875 goto bad; 2876 } 2877 g->re.group.number = number; 2878 g->re.group.last = env->parno; 2879 g->re.group.expr.binary.left = e; 2880 g->re.group.expr.binary.right = f; 2881 return g; 2882 bad: 2883 drop(env->disc, e); 2884 drop(env->disc, f); 2885 if (!env->error) 2886 env->error = REG_ENULL; 2887 return 0; 2888 } 2889 2890 /* 2891 * add v to REX_BM tables 2892 */ 2893 2894 static void 2895 bmstr(Cenv_t* env, register Rex_t* a, unsigned char* v, int n, Bm_mask_t b) 2896 { 2897 int c; 2898 int m; 2899 size_t z; 2900 2901 for (m = 0; m < n; m++) 2902 { 2903 if (!(z = n - m - 1)) 2904 z = HIT; 2905 c = v[m]; 2906 a->re.bm.mask[m][c] |= b; 2907 if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) 2908 a->re.bm.skip[c] = z; 2909 if (a->flags & REG_ICASE) 2910 { 2911 if (isupper(c)) 2912 c = tolower(c); 2913 else if (islower(c)) 2914 c = toupper(c); 2915 else 2916 continue; 2917 a->re.bm.mask[m][c] |= b; 2918 if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) 2919 a->re.bm.skip[c] = z; 2920 } 2921 } 2922 } 2923 2924 /* 2925 * set up BM table from trie 2926 */ 2927 2928 static int 2929 bmtrie(Cenv_t* env, Rex_t* a, unsigned char* v, Trie_node_t* x, int n, int m, Bm_mask_t b) 2930 { 2931 do 2932 { 2933 v[m] = x->c; 2934 if (m >= (n - 1)) 2935 { 2936 bmstr(env, a, v, n, b); 2937 if (!(b <<= 1)) 2938 { 2939 b = 1; 2940 a->re.bm.complete = 0; 2941 } 2942 else if (x->son) 2943 a->re.bm.complete = 0; 2944 } 2945 else if (x->son) 2946 b = bmtrie(env, a, v, x->son, n, m + 1, b); 2947 } while (x = x->sib); 2948 return b; 2949 } 2950 2951 /* 2952 * rewrite the expression tree for some special cases 2953 * 1. it is a null expression - illegal 2954 * 2. max length fixed string found -- use BM algorithm 2955 * 3. it begins with an unanchored string - use KMP algorithm 2956 * 0 returned on success 2957 */ 2958 2959 static int 2960 special(Cenv_t* env, regex_t* p) 2961 { 2962 Rex_t* a; 2963 Rex_t* e; 2964 Rex_t* t; 2965 Rex_t* x; 2966 Rex_t* y; 2967 unsigned char* s; 2968 int* f; 2969 int n; 2970 int m; 2971 int k; 2972 2973 DEBUG_INIT(); 2974 if (e = p->env->rex) 2975 { 2976 if ((x = env->stats.x) && x->re.string.size < 3) 2977 x = 0; 2978 if ((t = env->stats.y) && t->re.trie.min < 3) 2979 t = 0; 2980 if (x && t) 2981 { 2982 if (x->re.string.size >= t->re.trie.min) 2983 t = 0; 2984 else 2985 x = 0; 2986 } 2987 if (x || t) 2988 { 2989 Bm_mask_t** mask; 2990 Bm_mask_t* h; 2991 unsigned char* v; 2992 size_t* q; 2993 unsigned long l; 2994 int i; 2995 int j; 2996 2997 if (x) 2998 { 2999 y = x; 3000 n = m = x->re.string.size; 3001 l = env->stats.l; 3002 } 3003 else 3004 { 3005 y = t; 3006 n = t->re.trie.min; 3007 m = t->re.trie.max; 3008 l = env->stats.k; 3009 } 3010 if (!(q = (size_t*)alloc(env->disc, 0, (n + 1) * sizeof(size_t)))) 3011 return 1; 3012 if (!(a = node(env, REX_BM, 0, 0, n * (sizeof(Bm_mask_t*) + (UCHAR_MAX + 1) * sizeof(Bm_mask_t)) + (UCHAR_MAX + n + 2) * sizeof(size_t)))) 3013 { 3014 alloc(env->disc, q, 0); 3015 return 1; 3016 } 3017 a->flags = y->flags; 3018 a->map = y->map; 3019 a->re.bm.size = n; 3020 a->re.bm.back = (y == e || y == e->re.group.expr.rex) ? (m - n) : -1; 3021 a->re.bm.left = l - 1; 3022 a->re.bm.right = env->stats.m - l - n; 3023 a->re.bm.complete = (env->stats.e || y != e && (e->type != REX_GROUP || y != e->re.group.expr.rex) || e->next || ((a->re.bm.left + a->re.bm.right) >= 0)) ? 0 : n; 3024 h = (Bm_mask_t*)&a->re.bm.mask[n]; 3025 a->re.bm.skip = (size_t*)(h + n * (UCHAR_MAX + 1)); 3026 a->re.bm.fail = &a->re.bm.skip[UCHAR_MAX + 1]; 3027 for (m = 0; m <= UCHAR_MAX; m++) 3028 a->re.bm.skip[m] = n; 3029 a->re.bm.skip[0] = a->re.bm.skip[env->mappednewline] = (y->next && y->next->type == REX_END) ? HIT : (n + a->re.bm.left); 3030 for (i = 1; i <= n; i++) 3031 a->re.bm.fail[i] = 2 * n - i; 3032 mask = a->re.bm.mask; 3033 for (m = 0; m < n; m++) 3034 { 3035 mask[m] = h; 3036 h += UCHAR_MAX + 1; 3037 } 3038 if (x) 3039 bmstr(env, a, x->re.string.base, n, 1); 3040 else 3041 { 3042 v = (unsigned char*)q; 3043 memset(v, 0, n); 3044 m = 1; 3045 for (i = 0; i <= UCHAR_MAX; i++) 3046 if (t->re.trie.root[i]) 3047 m = bmtrie(env, a, v, t->re.trie.root[i], n, 0, m); 3048 } 3049 mask--; 3050 memset(q, 0, n * sizeof(*q)); 3051 for (k = (j = n) + 1; j > 0; j--, k--) 3052 { 3053 DEBUG_TEST(0x0010,(sfprintf(sfstderr, "BM#0: k=%d j=%d\n", k, j)),(0)); 3054 for (q[j] = k; k <= n; k = q[k]) 3055 { 3056 for (m = 0; m <= UCHAR_MAX; m++) 3057 if (mask[k][m] == mask[j][m]) 3058 { 3059 DEBUG_TEST(0x0010,sfprintf(sfstderr, "CUT1: mask[%d][%c]=mask[%d][%c]\n", k, m, j, m), (0)); 3060 goto cut; 3061 } 3062 DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#2: fail[%d]=%d => %d\n", k, a->re.bm.fail[k], (a->re.bm.fail[k] > n - j) ? (n - j) : a->re.bm.fail[k]), (0)); 3063 if (a->re.bm.fail[k] > n - j) 3064 a->re.bm.fail[k] = n - j; 3065 } 3066 cut: ; 3067 } 3068 for (i = 1; i <= n; i++) 3069 if (a->re.bm.fail[i] > n + k - i) 3070 { 3071 DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#4: fail[%d]=%d => %d\n", i, a->re.bm.fail[i], n + k - i), (0)); 3072 a->re.bm.fail[i] = n + k - i; 3073 } 3074 #if _AST_REGEX_DEBUG 3075 if (DEBUG_TEST(0x0020,(1),(0))) 3076 { 3077 sfprintf(sfstderr, "STAT: complete=%d n=%d k=%d l=%d r=%d y=%d:%d e=%d:%d\n", a->re.bm.complete, n, k, a->re.bm.left, a->re.bm.right, y->type, y->next ? y->next->type : 0, e->type, e->next ? e->next->type : 0); 3078 for (m = 0; m < n; m++) 3079 for (i = 1; i <= UCHAR_MAX; i++) 3080 if (a->re.bm.mask[m][i]) 3081 sfprintf(sfstderr, "MASK: [%d]['%c'] = %032..2u\n", m, i, a->re.bm.mask[m][i]); 3082 for (i = ' '; i <= UCHAR_MAX; i++) 3083 if (a->re.bm.skip[i] >= HIT) 3084 sfprintf(sfstderr, "SKIP: ['%c'] = *\n", i); 3085 else if (a->re.bm.skip[i] > 0 && a->re.bm.skip[i] < n) 3086 sfprintf(sfstderr, "SKIP: ['%c'] = %3d\n", i, a->re.bm.skip[i]); 3087 for (j = 31; j >= 0; j--) 3088 { 3089 sfprintf(sfstderr, " "); 3090 next: 3091 for (m = 0; m < n; m++) 3092 { 3093 for (i = 0040; i < 0177; i++) 3094 if (a->re.bm.mask[m][i] & (1 << j)) 3095 { 3096 sfprintf(sfstderr, " %c", i); 3097 break; 3098 } 3099 if (i >= 0177) 3100 { 3101 if (j > 0) 3102 { 3103 j--; 3104 goto next; 3105 } 3106 sfprintf(sfstderr, " ?"); 3107 } 3108 } 3109 sfprintf(sfstderr, "\n"); 3110 } 3111 sfprintf(sfstderr, "FAIL: "); 3112 for (m = 1; m <= n; m++) 3113 sfprintf(sfstderr, "%3d", a->re.bm.fail[m]); 3114 sfprintf(sfstderr, "\n"); 3115 } 3116 #endif 3117 alloc(env->disc, q, 0); 3118 a->next = e; 3119 p->env->rex = a; 3120 return 0; 3121 } 3122 switch (e->type) 3123 { 3124 case REX_BEG: 3125 if (env->flags & REG_NEWLINE) 3126 return 0; 3127 break; 3128 case REX_GROUP: 3129 if (env->stats.b) 3130 return 0; 3131 e = e->re.group.expr.rex; 3132 if (e->type != REX_DOT) 3133 return 0; 3134 /*FALLTHROUGH*/ 3135 case REX_DOT: 3136 if (e->lo == 0 && e->hi == RE_DUP_INF) 3137 break; 3138 return 0; 3139 case REX_NULL: 3140 if (env->flags & REG_NULL) 3141 break; 3142 env->error = REG_ENULL; 3143 return 1; 3144 case REX_STRING: 3145 if ((env->flags & (REG_LEFT|REG_LITERAL|REG_RIGHT)) || e->map) 3146 return 0; 3147 s = e->re.string.base; 3148 n = e->re.string.size; 3149 if (!(a = node(env, REX_KMP, 0, 0, n * (sizeof(int*) + 1)))) 3150 return 1; 3151 a->flags = e->flags; 3152 a->map = e->map; 3153 f = a->re.string.fail; 3154 memcpy((char*)(a->re.string.base = (unsigned char*)&f[n]), (char*)s, n); 3155 s = a->re.string.base; 3156 a->re.string.size = n; 3157 f[0] = m = -1; 3158 for (k = 1; k < n; k++) 3159 { 3160 while (m >= 0 && s[m+1] != s[k]) 3161 m = f[m]; 3162 if (s[m+1] == s[k]) 3163 m++; 3164 f[k] = m; 3165 } 3166 a->next = e->next; 3167 p->env->rex = a; 3168 e->next = 0; 3169 drop(env->disc, e); 3170 break; 3171 default: 3172 return 0; 3173 } 3174 } 3175 p->env->once = 1; 3176 return 0; 3177 } 3178 3179 int 3180 regcomp(regex_t* p, const char* pattern, regflags_t flags) 3181 { 3182 Rex_t* e; 3183 Rex_t* f; 3184 regdisc_t* disc; 3185 unsigned char* fold; 3186 int i; 3187 Cenv_t env; 3188 3189 if (!p) 3190 return REG_BADPAT; 3191 if (flags & REG_DISCIPLINE) 3192 { 3193 flags &= ~REG_DISCIPLINE; 3194 disc = p->re_disc; 3195 } 3196 else 3197 disc = &state.disc; 3198 if (!disc->re_errorlevel) 3199 disc->re_errorlevel = 2; 3200 p->env = 0; 3201 if (!pattern) 3202 return fatal(disc, REG_BADPAT, pattern); 3203 if (!state.initialized) 3204 { 3205 state.initialized = 1; 3206 for (i = 0; i < elementsof(state.escape); i++) 3207 state.magic[state.escape[i].key] = state.escape[i].val; 3208 } 3209 if (!(fold = (unsigned char*)LCINFO(AST_LC_CTYPE)->data)) 3210 { 3211 if (!(fold = newof(0, unsigned char, UCHAR_MAX, 1))) 3212 return fatal(disc, REG_ESPACE, pattern); 3213 for (i = 0; i <= UCHAR_MAX; i++) 3214 fold[i] = toupper(i); 3215 LCINFO(AST_LC_CTYPE)->data = (void*)fold; 3216 } 3217 again: 3218 if (!(p->env = (Env_t*)alloc(disc, 0, sizeof(Env_t)))) 3219 return fatal(disc, REG_ESPACE, pattern); 3220 memset(p->env, 0, sizeof(*p->env)); 3221 memset(&env, 0, sizeof(env)); 3222 env.regex = p; 3223 env.flags = flags; 3224 env.disc = p->env->disc = disc; 3225 if (env.flags & REG_AUGMENTED) 3226 env.flags |= REG_EXTENDED; 3227 env.mappeddot = '.'; 3228 env.mappednewline = '\n'; 3229 env.mappedslash = '/'; 3230 if (disc->re_version >= REG_VERSION_MAP && disc->re_map) 3231 { 3232 env.map = disc->re_map; 3233 env.MAP = p->env->fold; 3234 for (i = 0; i <= UCHAR_MAX; i++) 3235 { 3236 env.MAP[i] = fold[env.map[i]]; 3237 if (env.map[i] == '.') 3238 env.mappeddot = i; 3239 if (env.map[i] == '\n') 3240 env.mappednewline = i; 3241 if (env.map[i] == '/') 3242 env.mappedslash = i; 3243 } 3244 } 3245 else 3246 env.MAP = fold; 3247 env.type = (env.flags & REG_AUGMENTED) ? ARE : (env.flags & REG_EXTENDED) ? ERE : BRE; 3248 env.explicit = -1; 3249 if (env.flags & REG_SHELL) 3250 { 3251 if (env.flags & REG_SHELL_PATH) 3252 env.explicit = env.mappedslash; 3253 env.flags |= REG_LENIENT|REG_NULL; 3254 env.type = env.type == BRE ? SRE : KRE; 3255 } 3256 if ((env.flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE) 3257 env.explicit = env.mappednewline; 3258 p->env->leading = (env.flags & REG_SHELL_DOT) ? env.mappeddot : -1; 3259 env.posixkludge = !(env.flags & (REG_EXTENDED|REG_SHELL)); 3260 env.regexp = !!(env.flags & REG_REGEXP); 3261 env.token.lex = 0; 3262 env.token.push = 0; 3263 if (env.flags & REG_DELIMITED) 3264 { 3265 switch (env.delimiter = *pattern++) 3266 { 3267 case 0: 3268 case '\\': 3269 case '\n': 3270 case '\r': 3271 env.error = REG_EDELIM; 3272 goto bad; 3273 } 3274 env.terminator = '\n'; 3275 } 3276 env.literal = env.pattern = env.cursor = (unsigned char*)pattern; 3277 if (!(p->env->rex = alt(&env, 1, 0))) 3278 goto bad; 3279 if (env.parnest) 3280 { 3281 env.error = REG_EPAREN; 3282 goto bad; 3283 } 3284 p->env->stats.re_flags = env.flags & (REG_EXTENDED|REG_AUGMENTED|REG_SHELL); 3285 if (env.flags & REG_LEFT) 3286 { 3287 if (p->env->rex->type != REX_BEG) 3288 { 3289 if (p->env->rex->type == REX_ALT) 3290 env.flags &= ~REG_FIRST; 3291 if (!(e = node(&env, REX_BEG, 0, 0, 0))) 3292 { 3293 regfree(p); 3294 return fatal(disc, REG_ESPACE, pattern); 3295 } 3296 e->next = p->env->rex; 3297 p->env->rex = e; 3298 p->env->once = 1; 3299 } 3300 p->env->stats.re_flags |= REG_LEFT; 3301 } 3302 for (e = p->env->rex; e->next; e = e->next); 3303 p->env->done.type = REX_DONE; 3304 p->env->done.flags = e->flags; 3305 if (env.flags & REG_RIGHT) 3306 { 3307 if (e->type != REX_END) 3308 { 3309 if (p->env->rex->type == REX_ALT) 3310 env.flags &= ~REG_FIRST; 3311 if (!(f = node(&env, REX_END, 0, 0, 0))) 3312 { 3313 regfree(p); 3314 return fatal(disc, REG_ESPACE, pattern); 3315 } 3316 f->flags = e->flags; 3317 f->map = e->map; 3318 e->next = f; 3319 } 3320 p->env->stats.re_flags |= REG_RIGHT; 3321 } 3322 if (stats(&env, p->env->rex)) 3323 { 3324 if (!env.error) 3325 env.error = REG_ECOUNT; 3326 goto bad; 3327 } 3328 if (env.stats.b) 3329 p->env->hard = p->env->separate = 1; 3330 else if (!(env.flags & REG_FIRST) && (env.stats.a || env.stats.c > 1 && env.stats.c != env.stats.s || env.stats.t && (env.stats.t > 1 || env.stats.a || env.stats.c))) 3331 p->env->hard = 1; 3332 if (p->env->hard || env.stats.c || env.stats.i) 3333 p->env->stats.re_min = p->env->stats.re_max = -1; 3334 else 3335 { 3336 if (!(p->env->stats.re_min = env.stats.m)) 3337 p->env->stats.re_min = -1; 3338 if (!(p->env->stats.re_max = env.stats.n)) 3339 p->env->stats.re_max = -1; 3340 } 3341 if (special(&env, p)) 3342 goto bad; 3343 serialize(&env, p->env->rex, 1); 3344 p->re_nsub = env.stats.p; 3345 if (env.type == KRE) 3346 p->re_nsub /= 2; 3347 if (env.flags & REG_DELIMITED) 3348 { 3349 p->re_npat = env.cursor - env.pattern + 1; 3350 if (*env.cursor == env.delimiter) 3351 p->re_npat++; 3352 else if (env.flags & REG_MUSTDELIM) 3353 { 3354 env.error = REG_EDELIM; 3355 goto bad; 3356 } 3357 else 3358 env.flags &= ~REG_DELIMITED; 3359 } 3360 p->env->explicit = env.explicit; 3361 p->env->flags = env.flags & REG_COMP; 3362 p->env->min = env.stats.m; 3363 p->env->nsub = env.stats.p + env.stats.u; 3364 p->env->refs = 1; 3365 return 0; 3366 bad: 3367 regfree(p); 3368 if (!env.error) 3369 env.error = REG_ESPACE; 3370 if (env.type >= SRE && env.error != REG_ESPACE && !(flags & REG_LITERAL)) 3371 { 3372 flags |= REG_LITERAL; 3373 pattern = (const char*)env.literal; 3374 goto again; 3375 } 3376 return fatal(disc, env.error, pattern); 3377 } 3378 3379 /* 3380 * regcomp() on sized pattern 3381 * the lazy way around adding and checking env.end 3382 */ 3383 3384 int 3385 regncomp(regex_t* p, const char* pattern, size_t size, regflags_t flags) 3386 { 3387 char* s; 3388 int r; 3389 3390 if (!(s = malloc(size + 1))) 3391 return fatal((flags & REG_DISCIPLINE) ? p->re_disc : &state.disc, REG_ESPACE, pattern); 3392 memcpy(s, pattern, size); 3393 s[size] = 0; 3394 r = regcomp(p, s, flags); 3395 free(s); 3396 return r; 3397 } 3398 3399 /* 3400 * combine two compiled regular expressions if possible, 3401 * replacing first with the combination and freeing second. 3402 * return 0 on success. 3403 * the only combinations handled are building a Trie 3404 * from String|Kmp|Trie and String|Kmp 3405 */ 3406 3407 int 3408 regcomb(regex_t* p, regex_t* q) 3409 { 3410 Rex_t* e = p->env->rex; 3411 Rex_t* f = q->env->rex; 3412 Rex_t* g; 3413 Rex_t* h; 3414 Cenv_t env; 3415 3416 if (!e || !f) 3417 return fatal(p->env->disc, REG_BADPAT, NiL); 3418 if (p->env->separate || q->env->separate) 3419 return REG_ESUBREG; 3420 memset(&env, 0, sizeof(env)); 3421 env.disc = p->env->disc; 3422 if (e->type == REX_BM) 3423 { 3424 p->env->rex = e->next; 3425 e->next = 0; 3426 drop(env.disc, e); 3427 e = p->env->rex; 3428 } 3429 if (f->type == REX_BM) 3430 { 3431 q->env->rex = f->next; 3432 f->next = 0; 3433 drop(env.disc, f); 3434 f = q->env->rex; 3435 } 3436 if (e->type == REX_BEG && f->type == REX_BEG) 3437 { 3438 p->env->flags |= REG_LEFT; 3439 p->env->rex = e->next; 3440 e->next = 0; 3441 drop(env.disc, e); 3442 e = p->env->rex; 3443 q->env->rex = f->next; 3444 f->next = 0; 3445 drop(env.disc, f); 3446 f = q->env->rex; 3447 } 3448 for (g = e; g->next; g = g->next); 3449 for (h = f; h->next; h = h->next); 3450 if (g->next && g->next->type == REX_END && h->next && h->next->type == REX_END) 3451 { 3452 p->env->flags |= REG_RIGHT; 3453 drop(env.disc, g->next); 3454 g->next = 0; 3455 drop(env.disc, h->next); 3456 h->next = 0; 3457 } 3458 if (!(g = trie(&env, f, e))) 3459 return fatal(p->env->disc, REG_BADPAT, NiL); 3460 p->env->rex = g; 3461 if (!q->env->once) 3462 p->env->once = 0; 3463 q->env->rex = 0; 3464 if (p->env->flags & REG_LEFT) 3465 { 3466 if (!(e = node(&env, REX_BEG, 0, 0, 0))) 3467 { 3468 regfree(p); 3469 return fatal(p->env->disc, REG_ESPACE, NiL); 3470 } 3471 e->next = p->env->rex; 3472 p->env->rex = e; 3473 p->env->once = 1; 3474 } 3475 if (p->env->flags & REG_RIGHT) 3476 { 3477 for (f = p->env->rex; f->next; f = f->next); 3478 if (f->type != REX_END) 3479 { 3480 if (!(e = node(&env, REX_END, 0, 0, 0))) 3481 { 3482 regfree(p); 3483 return fatal(p->env->disc, REG_ESPACE, NiL); 3484 } 3485 f->next = e; 3486 } 3487 } 3488 env.explicit = p->env->explicit; 3489 env.flags = p->env->flags; 3490 env.disc = p->env->disc; 3491 if (stats(&env, p->env->rex)) 3492 { 3493 regfree(p); 3494 return fatal(p->env->disc, env.error ? env.error : REG_ECOUNT, NiL); 3495 } 3496 if (special(&env, p)) 3497 { 3498 regfree(p); 3499 return fatal(p->env->disc, env.error ? env.error : REG_ESPACE, NiL); 3500 } 3501 p->env->min = g->re.trie.min; 3502 return 0; 3503 } 3504 3505 /* 3506 * copy a reference of p into q 3507 * p and q may then have separate regsubcomp() instantiations 3508 */ 3509 3510 int 3511 regdup(regex_t* p, regex_t* q) 3512 { 3513 if (!p || !q) 3514 return REG_BADPAT; 3515 *q = *p; 3516 p->env->refs++; 3517 q->re_sub = 0; 3518 return 0; 3519 } 3520