1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1985-2012 AT&T Intellectual Property * 5 * and is licensed under the * 6 * Eclipse Public License, Version 1.0 * 7 * by AT&T Intellectual Property * 8 * * 9 * A copy of the License is available at * 10 * http://www.eclipse.org/org/documents/epl-v10.html * 11 * (with md5 checksum b35adb5213ca9657e911e9befb180842) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * Phong Vo <kpv@research.att.com> * 20 * * 21 ***********************************************************************/ 22 #pragma prototyped 23 24 /* 25 * posix regex compiler 26 */ 27 28 #include "reglib.h" 29 30 #if _PACKAGE_ast 31 #include "lclib.h" 32 #endif 33 34 #define serialize re_serialize /* hp.ia64 <unistd.h>! */ 35 36 #define C_ESC (-1) 37 #define C_MB (-2) 38 39 #if _AST_REGEX_DEBUG 40 41 #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n)) 42 #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0) 43 #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_comp_debug")) debug |= strtoul(t, NiL, 0); } } while (0) 44 45 static unsigned long debug; 46 static unsigned long debug_flag; 47 48 #else 49 50 #define DEBUG_INIT() 51 #define DEBUG_TEST(f,y,n) (n) 52 #define DEBUG_CODE(f,y,n) do {n} while(0) 53 54 #endif 55 56 #if _PACKAGE_ast 57 58 typedef struct Cchr_s 59 { 60 Dtlink_t lnk; 61 unsigned char nam[2]; 62 Ckey_t key; 63 } Cchr_t; 64 65 #endif 66 67 #define eat(p) do{if ((p)->token.push)(p)->token.push=0;else (p)->cursor+=(p)->token.len;}while (0) 68 69 /* 70 * determine whether greedy matching will work, i.e. produce 71 * the best match first. such expressions are "easy", and 72 * need no backtracking once a complete match is found. 73 * if an expression has backreferences or alts it's hard 74 * else if it has only one closure it's easy 75 * else if all closures are simple (i.e. one-character) it's easy 76 * else it's hard. 77 */ 78 79 typedef struct Stats_s 80 { 81 unsigned long l; /* min length to left of x */ 82 unsigned long k; /* min length to left of y */ 83 unsigned long m; /* min length */ 84 unsigned long n; /* max length */ 85 unsigned short a; /* number of alternations */ 86 unsigned short b; /* number of backrefs */ 87 unsigned short c; /* number of closures */ 88 unsigned short e; /* $ */ 89 unsigned short i; /* number of negations */ 90 unsigned short p; /* number of named subexpressions */ 91 unsigned short s; /* number of simple closures */ 92 unsigned short t; /* number of tries */ 93 unsigned short u; /* number of unnamed subexpressions */ 94 Rex_t* x; /* max length REX_STRING */ 95 Rex_t* y; /* max length REX_TRIE */ 96 } Stats_t; 97 98 typedef struct Token_s 99 { 100 unsigned long min; 101 unsigned long max; 102 short lex; 103 short len; 104 short esc; 105 short att; 106 short push; 107 } Token_t; 108 109 typedef struct Cenv_s 110 { 111 int delimiter; /* pattern delimiter */ 112 int error; /* last error */ 113 int explicit; /* explicit match on this char */ 114 int mappeddot; /* inverse mapped '.' */ 115 int mappednewline; /* inverse mapped '\n' */ 116 int mappedslash; /* inverse mapped '/' */ 117 regflags_t flags; /* flags arg to regcomp */ 118 int type; /* BRE,ERE,ARE,SRE,KRE */ 119 unsigned char* cursor; /* curent point in re */ 120 unsigned char* pattern; /* the original pattern */ 121 unsigned char* literal; /* literal restart pattern */ 122 int parno; /* number of last open paren */ 123 int parnest; /* paren nest count */ 124 int posixkludge; /* to make * nonspecial */ 125 Token_t token; /* token lookahead */ 126 Stats_t stats; /* RE statistics */ 127 int terminator; /* pattern terminator */ 128 Rex_t* paren[2*(BACK_REF_MAX+2)]; 129 /* paren[i]!=0 if \i defined */ 130 regex_t* regex; /* user handle */ 131 regdisc_t* disc; /* user discipline */ 132 unsigned char* map; /* external to native ccode map */ 133 unsigned char* MAP; /* fold and/or map */ 134 } Cenv_t; 135 136 /* 137 * allocate a new Rex_t node 138 */ 139 140 #if _BLD_DEBUG 141 #define node(a,b,c,d,e) node(a,b,c,d,e, const char* file, unsigned int line) 142 #endif 143 144 static Rex_t* 145 node(Cenv_t* env, int type, int lo, int hi, size_t extra) 146 { 147 register Rex_t* e; 148 149 DEBUG_TEST(0x0800,(sfprintf(sfstdout, "%s:%d node(%d,%d,%d,%u)\n", file, line, type, lo, hi, sizeof(Rex_t) + extra)),(0)); 150 if (e = (Rex_t*)alloc(env->disc, 0, sizeof(Rex_t) + extra)) 151 { 152 memset(e, 0, sizeof(Rex_t) + extra); 153 e->type = type; 154 e->marked = 0; 155 e->lo = lo; 156 e->hi = hi; 157 e->flags = env->flags; 158 e->map = (e->flags & REG_ICASE) ? env->MAP : env->map; 159 e->explicit = env->explicit; 160 if (extra) 161 e->re.data = (char*)e + sizeof(Rex_t); 162 } 163 return e; 164 } 165 166 #if _BLD_DEBUG 167 #undef node 168 #define node(a,b,c,d,e) node(a,b,c,d,e,__FILE__,__LINE__) 169 #endif 170 171 /* 172 * free a Trie_node_t node 173 */ 174 175 static void 176 triedrop(regdisc_t* disc, Trie_node_t* e) 177 { 178 if (e) 179 { 180 triedrop(disc, e->son); 181 triedrop(disc, e->sib); 182 alloc(disc, e, 0); 183 } 184 } 185 186 /* 187 * free a Rex_t node 188 */ 189 190 void 191 drop(regdisc_t* disc, Rex_t* e) 192 { 193 int i; 194 Rex_t* x; 195 196 if (e && !(disc->re_flags & REG_NOFREE)) 197 do 198 { 199 switch (e->type) 200 { 201 case REX_ALT: 202 case REX_CONJ: 203 drop(disc, e->re.group.expr.binary.left); 204 drop(disc, e->re.group.expr.binary.right); 205 break; 206 case REX_GROUP: 207 case REX_GROUP_AHEAD: 208 case REX_GROUP_AHEAD_NOT: 209 case REX_GROUP_BEHIND: 210 case REX_GROUP_BEHIND_NOT: 211 case REX_GROUP_CUT: 212 case REX_NEG: 213 case REX_REP: 214 drop(disc, e->re.group.expr.rex); 215 break; 216 case REX_TRIE: 217 for (i = 0; i <= UCHAR_MAX; i++) 218 triedrop(disc, e->re.trie.root[i]); 219 break; 220 } 221 x = e->next; 222 alloc(disc, e, 0); 223 } while (e = x); 224 } 225 226 /* 227 * mark e and descendants minimal 228 */ 229 230 static void 231 mark(register Rex_t* e, int set) 232 { 233 if (e && !e->marked) 234 do 235 { 236 e->marked = 1; 237 if (set) 238 e->flags |= REG_MINIMAL; 239 else 240 e->flags &= ~REG_MINIMAL; 241 switch (e->type) 242 { 243 case REX_ALT: 244 case REX_CONJ: 245 case REX_GROUP_COND: 246 if (e->re.group.expr.binary.left) 247 mark(e->re.group.expr.binary.left, set); 248 if (e->re.group.expr.binary.right) 249 mark(e->re.group.expr.binary.right, set); 250 break; 251 case REX_GROUP: 252 case REX_GROUP_AHEAD: 253 case REX_GROUP_AHEAD_NOT: 254 case REX_GROUP_BEHIND: 255 case REX_GROUP_BEHIND_NOT: 256 case REX_GROUP_CUT: 257 case REX_NEG: 258 case REX_REP: 259 case REX_TRIE: 260 mark(e->re.group.expr.rex, set); 261 break; 262 } 263 } while (e = e->next); 264 } 265 266 /* 267 * assign subexpression numbers by a preorder tree walk 268 */ 269 270 static int 271 serialize(Cenv_t* env, Rex_t* e, int n) 272 { 273 do 274 { 275 e->serial = n++; 276 switch (e->type) 277 { 278 case REX_ALT: 279 case REX_GROUP_COND: 280 if (e->re.group.expr.binary.left) 281 n = serialize(env, e->re.group.expr.binary.left, n); 282 e->re.group.expr.binary.serial = n++; 283 if (e->re.group.expr.binary.right) 284 n = serialize(env, e->re.group.expr.binary.right, n); 285 break; 286 case REX_CONJ: 287 n = serialize(env, e->re.group.expr.binary.left, n); 288 n = serialize(env, e->re.group.expr.binary.right, n); 289 break; 290 case REX_GROUP: 291 case REX_GROUP_AHEAD: 292 case REX_GROUP_AHEAD_NOT: 293 case REX_GROUP_BEHIND: 294 case REX_GROUP_BEHIND_NOT: 295 case REX_GROUP_CUT: 296 case REX_NEG: 297 case REX_REP: 298 n = serialize(env, e->re.group.expr.rex, n); 299 break; 300 } 301 } while (e = e->next); 302 return n; 303 } 304 305 /* 306 * catenate e and f into a sequence, collapsing them if possible 307 */ 308 309 static Rex_t* 310 cat(Cenv_t* env, Rex_t* e, Rex_t* f) 311 { 312 Rex_t* g; 313 314 if (!f) 315 { 316 drop(env->disc, e); 317 return 0; 318 } 319 if (e->type == REX_NULL) 320 { 321 drop(env->disc, e); 322 return f; 323 } 324 if (f->type == REX_NULL) 325 { 326 g = f->next; 327 f->next = 0; 328 drop(env->disc, f); 329 f = g; 330 } 331 else if (e->type == REX_DOT && f->type == REX_DOT) 332 { 333 unsigned int m = e->lo + f->lo; 334 unsigned int n = e->hi + f->hi; 335 336 if (m <= RE_DUP_MAX) 337 { 338 if (e->hi > RE_DUP_MAX || f->hi > RE_DUP_MAX) 339 { 340 n = RE_DUP_INF; 341 goto combine; 342 } 343 else if (n <= RE_DUP_MAX) 344 { 345 combine: 346 e->lo = m; 347 e->hi = n; 348 g = f->next; 349 f->next = 0; 350 drop(env->disc, f); 351 f = g; 352 } 353 } 354 } 355 e->next = f; 356 return e; 357 } 358 359 /* 360 * collect re statistics 361 */ 362 363 static int 364 stats(register Cenv_t* env, register Rex_t* e) 365 { 366 register unsigned long n; 367 register unsigned long m; 368 unsigned long cm; 369 unsigned long nm; 370 unsigned long cn; 371 unsigned long nn; 372 unsigned long l; 373 unsigned long k; 374 unsigned long t; 375 Rex_t* q; 376 Rex_t* x; 377 Rex_t* y; 378 unsigned char c; 379 unsigned char b; 380 381 do 382 { 383 switch (e->type) 384 { 385 case REX_ALT: 386 x = env->stats.x; 387 l = env->stats.l; 388 y = env->stats.y; 389 k = env->stats.k; 390 t = env->stats.t; 391 if (++env->stats.a <= 0) 392 return 1; 393 cm = env->stats.m; 394 env->stats.m = 0; 395 cn = env->stats.n; 396 env->stats.n = 0; 397 if (stats(env, e->re.group.expr.binary.left)) 398 return 1; 399 m = env->stats.m; 400 env->stats.m = 0; 401 n = env->stats.n; 402 env->stats.n = 0; 403 if (e->re.group.expr.binary.right && stats(env, e->re.group.expr.binary.right)) 404 return 1; 405 if (env->stats.m > m) 406 env->stats.m = m; 407 else 408 m = env->stats.m; 409 if ((env->stats.m += cm) < m) 410 return 1; 411 if (env->stats.n < n) 412 env->stats.n = n; 413 else 414 n = env->stats.n; 415 if ((env->stats.n += cn) < n) 416 return 1; 417 env->stats.x = x; 418 env->stats.l = l; 419 env->stats.y = y; 420 env->stats.k = k; 421 env->stats.t = t; 422 break; 423 case REX_BACK: 424 if (++env->stats.b <= 0) 425 return 1; 426 break; 427 case REX_CLASS: 428 case REX_COLL_CLASS: 429 case REX_DOT: 430 case REX_ONECHAR: 431 n = env->stats.m; 432 if ((env->stats.m += e->lo) < n) 433 return 1; 434 if (e->hi != RE_DUP_INF) 435 { 436 n = env->stats.n; 437 if ((env->stats.n += e->hi) < n) 438 return 1; 439 } 440 if (e->lo != e->hi) 441 { 442 if (++env->stats.c <= 0) 443 return 1; 444 if (++env->stats.s <= 0) 445 return 1; 446 } 447 break; 448 case REX_CONJ: 449 cm = env->stats.m; 450 env->stats.m = 0; 451 cn = env->stats.n; 452 env->stats.n = 0; 453 if (stats(env, e->re.group.expr.binary.left)) 454 return 1; 455 nm = env->stats.m; 456 env->stats.m = 0; 457 nn = env->stats.n; 458 env->stats.n = 0; 459 if (stats(env, e->re.group.expr.binary.right)) 460 return 1; 461 if (env->stats.m < nm) 462 env->stats.m = nm; 463 else 464 nm = env->stats.m; 465 if ((env->stats.m += cm) < nm) 466 return 1; 467 if (env->stats.n < nn) 468 env->stats.n = nn; 469 else 470 nn = env->stats.n; 471 if ((env->stats.n += cn) < nn) 472 return 1; 473 break; 474 case REX_END: 475 env->stats.e = 1; 476 break; 477 case REX_GROUP: 478 if (e->re.group.number && ++env->stats.p <= 0 || !e->re.group.number && ++env->stats.u <= 0) 479 return 1; 480 if (stats(env, e->re.group.expr.rex)) 481 return 1; 482 break; 483 case REX_GROUP_AHEAD: 484 case REX_GROUP_AHEAD_NOT: 485 case REX_GROUP_BEHIND: 486 case REX_GROUP_BEHIND_NOT: 487 m = env->stats.m; 488 n = env->stats.n; 489 x = env->stats.x; 490 y = env->stats.y; 491 if (stats(env, e->re.group.expr.rex)) 492 return 1; 493 env->stats.m = m; 494 env->stats.n = n; 495 env->stats.x = x; 496 env->stats.y = y; 497 switch (e->type) 498 { 499 case REX_GROUP_AHEAD: 500 case REX_GROUP_BEHIND: 501 if (++env->stats.u <= 0) 502 return 1; 503 break; 504 } 505 break; 506 case REX_GROUP_COND: 507 if (++env->stats.u <= 0) 508 return 1; 509 m = env->stats.m; 510 n = env->stats.n; 511 x = env->stats.x; 512 y = env->stats.y; 513 if (e->re.group.size > 0 && ++env->stats.b <= 0) 514 return 1; 515 if (e->re.group.expr.binary.left && stats(env, e->re.group.expr.binary.left)) 516 return 1; 517 if (q = e->re.group.expr.binary.right) 518 { 519 if (q->re.group.expr.binary.left && stats(env, q->re.group.expr.binary.left)) 520 return 1; 521 if (q->re.group.expr.binary.right && stats(env, q->re.group.expr.binary.right)) 522 return 1; 523 } 524 env->stats.m = m; 525 env->stats.n = n; 526 env->stats.x = x; 527 env->stats.y = y; 528 break; 529 case REX_GROUP_CUT: 530 if (++env->stats.u <= 0) 531 return 1; 532 m = env->stats.m; 533 n = env->stats.n; 534 x = env->stats.x; 535 y = env->stats.y; 536 if (stats(env, e->re.group.expr.rex)) 537 return 1; 538 env->stats.m = m; 539 env->stats.n = n; 540 env->stats.x = x; 541 env->stats.y = y; 542 break; 543 case REX_NEG: 544 env->stats.i++; 545 x = env->stats.x; 546 l = env->stats.l; 547 y = env->stats.y; 548 k = env->stats.k; 549 t = env->stats.t; 550 cm = env->stats.m; 551 env->stats.m = 0; 552 if (stats(env, e->re.group.expr.rex)) 553 return 1; 554 env->stats.m = !env->stats.m; 555 if ((env->stats.m += cm) < cm) 556 return 1; 557 env->stats.x = x; 558 env->stats.l = l; 559 env->stats.y = y; 560 env->stats.k = k; 561 env->stats.t = t; 562 break; 563 case REX_REP: 564 x = env->stats.x; 565 l = env->stats.l; 566 y = env->stats.y; 567 k = env->stats.k; 568 t = env->stats.t; 569 if (++env->stats.c <= 0) 570 return 1; 571 b = env->stats.b; 572 c = env->stats.c; 573 cm = env->stats.m; 574 env->stats.m = 0; 575 if (stats(env, e->re.group.expr.rex)) 576 return 1; 577 if (env->stats.m == 1 && b == env->stats.b && c == env->stats.c && ++env->stats.s <= 0) 578 return 1; 579 if (e->lo < 1) 580 { 581 env->stats.x = x; 582 env->stats.l = l; 583 env->stats.y = y; 584 env->stats.k = k; 585 env->stats.t = t; 586 env->stats.m = cm; 587 } 588 else 589 { 590 m = env->stats.m; 591 if ((env->stats.m *= e->lo) > 0 && env->stats.m < m) 592 return 1; 593 m = env->stats.m; 594 if ((env->stats.m += cm) < m) 595 return 1; 596 if (env->stats.x != x) 597 env->stats.l = cm; 598 if (env->stats.y != y) 599 env->stats.k = cm; 600 } 601 break; 602 case REX_STRING: 603 if (!e->map) 604 { 605 cm = env->stats.m; 606 if ((env->stats.m += e->re.string.size) < cm) 607 return 1; 608 cn = env->stats.n; 609 if ((env->stats.n += e->re.string.size) < cn) 610 return 1; 611 if (!env->stats.x || env->stats.x->re.string.size < e->re.string.size) 612 { 613 env->stats.x = e; 614 env->stats.l = cm; 615 } 616 } 617 break; 618 case REX_TRIE: 619 if (++env->stats.s <= 0) 620 return 1; 621 cm = env->stats.m; 622 if ((env->stats.m += e->re.trie.min) < cm) 623 return 1; 624 cn = env->stats.n; 625 if ((env->stats.n += e->re.trie.max) < cn) 626 return 1; 627 env->stats.t++; 628 if (!env->stats.y || env->stats.y->re.trie.min < e->re.trie.min) 629 { 630 env->stats.y = e; 631 env->stats.k = cm; 632 } 633 break; 634 } 635 } while (e = e->next); 636 return 0; 637 } 638 639 static int token(Cenv_t*); 640 641 static int 642 magic(register Cenv_t* env, register int c, int escaped) 643 { 644 register char* sp; 645 register int n; 646 int o = c; 647 int e = env->error; 648 int l = env->token.len; 649 short* mp; 650 char* ep; 651 652 if (mp = state.magic[c]) 653 { 654 c = mp[env->type+escaped]; 655 if (c >= T_META) 656 { 657 sp = (char*)env->cursor + env->token.len; 658 switch (c) 659 { 660 case T_LEFT: 661 n = 0; 662 ep = sp; 663 while (*sp >= '0' && *sp <= '9') 664 { 665 if (n > (INT_MAX / 10)) 666 { 667 env->error = REG_BADBR; 668 goto bad; 669 } 670 n = n * 10 + *sp++ - '0'; 671 } 672 if (sp == ep) 673 { 674 if (env->type < SRE || *sp != ',') 675 { 676 env->error = *sp ? REG_BADBR : REG_EBRACE; 677 goto bad; 678 } 679 } 680 else if (n > RE_DUP_MAX) 681 { 682 env->error = REG_BADBR; 683 goto bad; 684 } 685 env->token.min = n; 686 if (*sp == ',') 687 { 688 n = 0; 689 ep = ++sp; 690 while (*sp >= '0' && *sp <= '9') 691 { 692 if (n > (INT_MAX / 10)) 693 { 694 env->error = REG_BADBR; 695 goto bad; 696 } 697 n = n * 10 + *sp++ - '0'; 698 } 699 if (sp == ep) 700 n = RE_DUP_INF; 701 else if (n < env->token.min) 702 { 703 env->error = REG_BADBR; 704 goto bad; 705 } 706 } 707 env->token.max = n; 708 switch (*sp) 709 { 710 case 0: 711 env->error = REG_EBRACE; 712 goto bad; 713 case '\\': 714 if (!escaped) 715 { 716 env->error = REG_BADBR; 717 goto bad; 718 } 719 sp++; 720 break; 721 default: 722 if (escaped) 723 { 724 env->error = REG_BADBR; 725 goto bad; 726 } 727 break; 728 } 729 switch (*sp++) 730 { 731 case 0: 732 env->error = REG_EBRACE; 733 goto bad; 734 case '}': 735 break; 736 default: 737 env->error = REG_BADBR; 738 goto bad; 739 } 740 env->token.len = sp - (char*)env->cursor; 741 break; 742 case T_RIGHT: 743 env->error = REG_EBRACE; 744 goto bad; 745 case T_OPEN: 746 if (env->type < SRE && *sp == '?') 747 { 748 env->token.len++; 749 env->token.lex = 0; 750 goto group; 751 } 752 break; 753 case T_ESCAPE: 754 c = chresc(sp - 2, &ep); 755 if (ep < sp) 756 goto bad; 757 env->token.len += ep - sp; 758 if (c >= T_META) 759 { 760 env->token.lex = c; 761 c = C_ESC; 762 } 763 return c; 764 case T_BACK+0: 765 case T_BACK+1: 766 case T_BACK+2: 767 case T_BACK+3: 768 case T_BACK+4: 769 case T_BACK+5: 770 case T_BACK+6: 771 case T_BACK+7: 772 n = chresc(sp - 2, &ep); 773 if (ep > sp + 1) 774 { 775 env->token.len += ep - sp; 776 return n; 777 } 778 /*FALLTHROUGH*/ 779 case T_BACK+8: 780 case T_BACK+9: 781 if (env->type == SRE || c == T_BACK && !(env->flags & (REG_LENIENT|REG_REGEXP))) 782 { 783 env->error = REG_BADESC; 784 goto bad; 785 } 786 if ((env->flags & REG_MULTIREF) && isdigit(*sp)) 787 { 788 c = (c - T_BACK) * 10 + (*sp - '0'); 789 if (c > 0 && c <= env->parno && env->paren[c]) 790 c += T_BACK; 791 else 792 c = chresc(sp - 2, &ep); 793 env->token.len++; 794 } 795 if (c == T_BACK) 796 c = 0; 797 break; 798 case T_BAD: 799 if (escaped == 1 && (env->flags & (REG_LENIENT|REG_REGEXP)) && (c = mp[env->type+escaped+2]) >= T_META) 800 return c; 801 goto bad; 802 } 803 if (env->type >= SRE) 804 { 805 if (c == T_DOT) 806 c = '.'; 807 else if (c < T_OPEN) 808 { 809 if (env->type == KRE && *(env->cursor + env->token.len) == '-' && *(env->cursor + env->token.len + 1) == '(') 810 { 811 env->token.len++; 812 env->token.att = 1; 813 } 814 if (env->type == KRE && *(env->cursor + env->token.len) == '(') 815 { 816 env->token.len++; 817 switch (c) 818 { 819 case T_AT: 820 break; 821 case T_PERCENT: 822 env->token.lex = c; 823 goto group; 824 case T_TILDE: 825 env->token.lex = 0; 826 goto group; 827 default: 828 env->token.lex = c; 829 break; 830 } 831 c = T_OPEN; 832 } 833 else if (c == T_STAR) 834 c = T_DOTSTAR; 835 else if (c == T_QUES) 836 c = T_DOT; 837 else 838 { 839 c = o; 840 env->token.len = l; 841 } 842 } 843 else if (c > T_BACK) 844 { 845 c = (c - T_BACK) * 2 - 1; 846 c = (c > env->parno || !env->paren[c]) ? o : T_BACK + c; 847 } 848 else if (env->type == KRE && !env->parnest && (env->flags & REG_SHELL_GROUP)) 849 { 850 if (c == T_AND) 851 c = '&'; 852 else if (c == T_BAR) 853 c = '|'; 854 else if (c == T_OPEN) 855 c = '('; 856 } 857 } 858 } 859 } 860 else if (escaped == 2) 861 { 862 if (env->type >= SRE && !(env->flags & REG_SHELL_ESCAPED) || (env->flags & REG_ESCAPE) && (c == '[' || c == '-' || c == ']' || env->delimiter && c == env->delimiter)) 863 /*ok*/; 864 else 865 { 866 env->error = REG_BADESC; 867 goto bad; 868 } 869 } 870 else if (escaped && !(env->flags & (REG_LENIENT|REG_REGEXP)) && c != ']') 871 { 872 env->error = REG_BADESC; 873 goto bad; 874 } 875 return c; 876 group: 877 sp = (char*)env->cursor + env->token.len; 878 switch (*sp++) 879 { 880 case ')': 881 break; 882 case '#': 883 for (;;) 884 { 885 switch (*sp++) 886 { 887 case 0: 888 env->error = REG_EPAREN; 889 return T_BAD; 890 case ')': 891 break; 892 default: 893 continue; 894 } 895 break; 896 } 897 break; 898 default: 899 return T_GROUP; 900 } 901 env->cursor = (unsigned char*)sp; 902 return token(env); 903 bad: 904 if (escaped == 2) 905 env->error = e; 906 else if (env->flags & (REG_LENIENT|REG_REGEXP)) 907 return o; 908 else if (escaped == 1 && !env->error) 909 { 910 if (mp || o == ']') 911 return o; 912 env->error = REG_BADESC; 913 } 914 return T_BAD; 915 } 916 917 static int 918 token(register Cenv_t* env) 919 { 920 int c; 921 int posixkludge; 922 923 if (env->token.push) 924 return env->token.lex; 925 env->token.att = env->token.esc = 0; 926 if ((env->token.len = MBSIZE(env->cursor)) > 1) 927 return env->token.lex = C_MB; 928 env->token.lex = 0; 929 for (;;) 930 { 931 c = *env->cursor; 932 if (c == 0 || c == env->delimiter || c == env->terminator) 933 return T_END; 934 if (!(env->flags & REG_COMMENT)) 935 break; 936 if (c == '#') 937 { 938 do 939 { 940 c = *++env->cursor; 941 if (c == 0 || c == env->delimiter) 942 return T_END; 943 } while (c != '\n'); 944 } 945 else if (!isspace(c)) 946 break; 947 env->cursor++; 948 } 949 if (c == '\n' && (env->flags & REG_MULTIPLE) && !env->delimiter) 950 { 951 if (env->parnest) 952 { 953 env->error = REG_EPAREN; 954 return T_BAD; 955 } 956 env->parno = 0; 957 env->pattern = env->cursor + 1; 958 return T_BAR; 959 } 960 if (env->flags & REG_LITERAL) 961 return c; 962 if (posixkludge = env->posixkludge) 963 { 964 env->posixkludge = 0; 965 if (c == '*') 966 return c; 967 } 968 if (c == '\\') 969 { 970 if (env->flags & REG_SHELL_ESCAPED) 971 return c; 972 if (!(c = *(env->cursor + 1)) || c == env->terminator) 973 { 974 if (env->flags & (REG_LENIENT|REG_REGEXP)) 975 { 976 if (c) 977 { 978 env->token.esc = env->token.len; 979 env->token.len += MBSIZE(env->cursor + 1); 980 return c; 981 } 982 return '\\'; 983 } 984 env->error = REG_EESCAPE; 985 return T_BAD; 986 } 987 env->token.esc = env->token.len; 988 env->token.len += MBSIZE(env->cursor + 1); 989 if (env->delimiter && c == 'n') 990 return '\n'; 991 else if (c == env->delimiter) 992 return magic(env, c, 0); 993 else if (c == '(' && env->type == BRE) 994 env->posixkludge = 1; 995 else if (c == ')' && env->type == BRE && env->parnest <= 0) 996 { 997 env->error = REG_EPAREN; 998 return T_BAD; 999 } 1000 else if (isspace(c) && (env->flags & REG_COMMENT)) 1001 return c; 1002 return magic(env, c, 1); 1003 } 1004 else if (c == '$') 1005 { 1006 if (env->type == BRE && (*(env->cursor + 1) == 0 || *(env->cursor + 1) == env->delimiter || *(env->cursor + 1) == env->terminator || *(env->cursor + 1) == '\\' && *(env->cursor + 2) == ')') || (env->flags & REG_MULTIPLE) && *(env->cursor + 1) == '\n') 1007 return T_DOLL; 1008 } 1009 else if (c == '^') 1010 { 1011 if (env->type == BRE && (env->cursor == env->pattern || posixkludge == 1)) 1012 { 1013 env->posixkludge = 2; 1014 return T_CFLX; 1015 } 1016 } 1017 else if (c == ')') 1018 { 1019 if (env->type != BRE && env->parnest <= 0) 1020 return c; 1021 } 1022 else if (c == '/' && env->explicit == env->mappedslash) 1023 { 1024 while (*(env->cursor + env->token.len) == c) 1025 env->token.len++; 1026 return T_SLASHPLUS; 1027 } 1028 return magic(env, c, 0); 1029 } 1030 1031 static Celt_t* 1032 col(Celt_t* ce, int ic, unsigned char* bp, int bw, int bc, unsigned char* ep, int ew, int ec) 1033 { 1034 register char* s; 1035 register unsigned char* k; 1036 register unsigned char* e; 1037 register int c; 1038 register int cc; 1039 int bt; 1040 int et; 1041 Ckey_t key; 1042 1043 cc = 0; 1044 for (;;) 1045 { 1046 k = key; 1047 if (bw == 1) 1048 { 1049 c = bc; 1050 if (ic) 1051 { 1052 if (isupper(c)) 1053 { 1054 c = tolower(c); 1055 cc = -1; 1056 } 1057 else if (islower(c)) 1058 { 1059 c = toupper(c); 1060 cc = -1; 1061 } 1062 } 1063 *k++ = c; 1064 } 1065 else if (bw < COLL_KEY_MAX) 1066 { 1067 s = (char*)bp; 1068 if (ic) 1069 { 1070 c = mbchar(s); 1071 if (iswupper(c)) 1072 { 1073 c = towlower(c); 1074 cc = 1; 1075 } 1076 else if (iswlower(c)) 1077 { 1078 c = towupper(c); 1079 cc = 1; 1080 } 1081 } 1082 if (cc > 0) 1083 { 1084 cc = -1; 1085 k += mbconv((char*)k, c); 1086 } 1087 else 1088 for (e = k + bw; k < e; *k++ = *s++); 1089 } 1090 *k = 0; 1091 mbxfrm(ce->beg, key, COLL_KEY_MAX); 1092 if (ep) 1093 { 1094 k = key; 1095 c = mbchar(k); 1096 if (iswupper(c)) 1097 bt = COLL_range_uc; 1098 else if (iswlower(c)) 1099 bt = COLL_range_lc; 1100 else 1101 bt = COLL_range; 1102 k = key; 1103 if (ew == 1) 1104 { 1105 c = ec; 1106 if (ic) 1107 { 1108 if (isupper(c)) 1109 { 1110 c = tolower(c); 1111 cc = -1; 1112 } 1113 else if (islower(c)) 1114 { 1115 c = toupper(c); 1116 cc = -1; 1117 } 1118 } 1119 *k++ = c; 1120 } 1121 else if (ew < COLL_KEY_MAX) 1122 { 1123 s = (char*)ep; 1124 if (ic) 1125 { 1126 c = mbchar(s); 1127 if (iswupper(c)) 1128 { 1129 c = towlower(c); 1130 cc = 1; 1131 } 1132 else if (iswlower(c)) 1133 { 1134 c = towupper(c); 1135 cc = 1; 1136 } 1137 } 1138 if (cc > 0) 1139 { 1140 cc = -1; 1141 k += mbconv((char*)k, c); 1142 } 1143 else 1144 for (e = k + ew; k < e; *k++ = *s++); 1145 } 1146 *k = 0; 1147 mbxfrm(ce->end, key, COLL_KEY_MAX); 1148 k = key; 1149 c = mbchar(k); 1150 if (iswupper(c)) 1151 et = COLL_range_uc; 1152 else if (iswlower(c)) 1153 et = COLL_range_lc; 1154 else 1155 et = COLL_range; 1156 ce->typ = bt == et ? bt : COLL_range; 1157 } 1158 else 1159 ce->typ = COLL_char; 1160 ce++; 1161 if (!ic || !cc) 1162 break; 1163 ic = 0; 1164 } 1165 return ce; 1166 } 1167 1168 static Rex_t* 1169 bra(Cenv_t* env) 1170 { 1171 Rex_t* e; 1172 int c; 1173 int i; 1174 int w; 1175 int neg; 1176 int last; 1177 int inrange; 1178 int complicated; 1179 int collate; 1180 int elements; 1181 unsigned char* first; 1182 unsigned char* start; 1183 unsigned char* begin; 1184 unsigned char* s; 1185 regclass_t f; 1186 unsigned char buf[4 * (COLL_KEY_MAX + 1)]; 1187 #if _PACKAGE_ast 1188 int ic; 1189 char mbc[COLL_KEY_MAX + 1]; 1190 #endif 1191 1192 if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) 1193 return 0; 1194 collate = complicated = elements = 0; 1195 if (*env->cursor == '^' || env->type >= SRE && *env->cursor == '!') 1196 { 1197 env->cursor++; 1198 complicated = neg = 1; 1199 } 1200 else 1201 neg = 0; 1202 first = env->cursor; 1203 start = first + MBSIZE(first); 1204 if (*env->cursor == 0 || *(env->cursor + 1) == 0 || *env->cursor == env->terminator || *(env->cursor + 1) == env->terminator || (env->flags & REG_ESCAPE) && (*env->cursor == env->delimiter || *env->cursor != '\\' && *(env->cursor + 1) == env->delimiter)) 1205 goto error; 1206 begin = env->cursor + MBSIZE(env->cursor); 1207 1208 /* 1209 * inrange: 0=no, 1=possibly, 2=definitely 1210 */ 1211 1212 inrange = 0; 1213 for (;;) 1214 { 1215 if (!(c = *env->cursor) || c == env->terminator || c == env->delimiter && (env->flags & REG_ESCAPE)) 1216 goto error; 1217 env->cursor += (w = MBSIZE(env->cursor)); 1218 if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))) 1219 { 1220 if (*env->cursor) 1221 { 1222 if (*env->cursor == 'n') 1223 { 1224 env->cursor++; 1225 c = '\n'; 1226 } 1227 else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) 1228 { 1229 env->token.len = 1; 1230 w = magic(env, *env->cursor, 2); 1231 if (env->token.len > 1 || w != T_BAD) 1232 { 1233 if (env->token.len == 1 && (f = classfun(w))) 1234 { 1235 if (inrange > 1) 1236 { 1237 if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP))) 1238 goto erange; 1239 inrange = 0; 1240 } 1241 env->cursor++; 1242 for (c = 0; c <= UCHAR_MAX; c++) 1243 if ((*f)(c)) 1244 setadd(e->re.charclass, c); 1245 complicated++; 1246 elements++; 1247 continue; 1248 } 1249 if (env->token.len > 1 || w >= 0 && w < T_META) 1250 { 1251 c = w; 1252 if (c > UCHAR_MAX) 1253 { 1254 if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP)) && !mbwide()) 1255 goto erange; 1256 c = UCHAR_MAX; 1257 } 1258 env->cursor += env->token.len; 1259 } 1260 } 1261 } 1262 } 1263 } 1264 else if (c == ']') 1265 { 1266 if (env->cursor == begin) 1267 { 1268 last = c; 1269 inrange = 1; 1270 continue; 1271 } 1272 if (inrange != 0) 1273 { 1274 setadd(e->re.charclass, last); 1275 elements++; 1276 if (inrange == 2) 1277 { 1278 setadd(e->re.charclass, '-'); 1279 elements++; 1280 } 1281 } 1282 break; 1283 } 1284 else if (c == '-') 1285 { 1286 if (!inrange && env->cursor != begin && *env->cursor != ']') 1287 { 1288 if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP))) 1289 goto erange; 1290 continue; 1291 } 1292 else if (inrange == 1) 1293 { 1294 inrange = 2; 1295 complicated++; 1296 continue; 1297 } 1298 } 1299 else if (c == '[') 1300 { 1301 switch (*env->cursor) 1302 { 1303 case 0: 1304 goto error; 1305 case ':': 1306 if (env->flags & REG_REGEXP) 1307 goto normal; 1308 if (inrange == 1) 1309 { 1310 setadd(e->re.charclass, last); 1311 elements++; 1312 } 1313 if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) 1314 { 1315 if (env->cursor == start && (c = *(env->cursor + 1))) 1316 { 1317 s = start = env->cursor + 1; 1318 while (*++s && *s != ':'); 1319 if (*s == ':' && *(s + 1) == ']' && *(s + 2) == ']') 1320 { 1321 if ((i = (s - start)) == 1) 1322 { 1323 switch (c) 1324 { 1325 case '<': 1326 i = REX_WBEG; 1327 break; 1328 case '>': 1329 i = REX_WEND; 1330 break; 1331 default: 1332 i = 0; 1333 break; 1334 } 1335 if (i) 1336 { 1337 env->cursor = s + 3; 1338 drop(env->disc, e); 1339 return node(env, i, 0, 0, 0); 1340 } 1341 } 1342 } 1343 } 1344 env->error = REG_ECTYPE; 1345 goto error; 1346 } 1347 for (c = 0; c <= UCHAR_MAX; c++) 1348 if ((*f)(c)) 1349 setadd(e->re.charclass, c); 1350 inrange = 0; 1351 complicated++; 1352 elements++; 1353 continue; 1354 case '=': 1355 if (env->flags & REG_REGEXP) 1356 goto normal; 1357 if (inrange == 2) 1358 goto erange; 1359 if (inrange == 1) 1360 { 1361 setadd(e->re.charclass, last); 1362 elements++; 1363 } 1364 if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf), NiL)) < 0) 1365 goto ecollate; 1366 if (c > 1) 1367 collate++; 1368 else 1369 setadd(e->re.charclass, buf[0]); 1370 c = buf[0]; 1371 inrange = 0; 1372 complicated++; 1373 elements++; 1374 continue; 1375 case '.': 1376 if (env->flags & REG_REGEXP) 1377 goto normal; 1378 if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf), NiL)) < 0) 1379 goto ecollate; 1380 if (c > 1) 1381 collate++; 1382 c = buf[0]; 1383 complicated++; 1384 break; 1385 default: 1386 normal: 1387 if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) 1388 goto error; 1389 break; 1390 } 1391 } 1392 else if (w > 1) 1393 complicated++; 1394 if (inrange == 2) 1395 { 1396 if (last <= c) 1397 { 1398 for (i = last; i <= c; i++) 1399 setadd(e->re.charclass, i); 1400 inrange = env->type >= SRE || (env->flags & (REG_LENIENT|REG_REGEXP)); 1401 elements += 2; 1402 } 1403 else if (env->type >= SRE) 1404 { 1405 setadd(e->re.charclass, last); 1406 setadd(e->re.charclass, c); 1407 elements += 2; 1408 inrange = 1; 1409 } 1410 else if (!(env->flags & (REG_LENIENT|REG_REGEXP))) 1411 goto erange; 1412 else 1413 inrange = 0; 1414 } 1415 else if (inrange == 1) 1416 { 1417 setadd(e->re.charclass, last); 1418 elements++; 1419 } 1420 else 1421 inrange = 1; 1422 last = c; 1423 } 1424 #if _PACKAGE_ast 1425 if (complicated && mbcoll()) 1426 { 1427 Dt_t* dt; 1428 Cchr_t* cc; 1429 Cchr_t* tc; 1430 Cchr_t* xc; 1431 Celt_t* ce; 1432 Cchr_t key; 1433 int rw; 1434 int rc; 1435 wchar_t wc; 1436 unsigned char* rp; 1437 unsigned char* pp; 1438 char cb[2][COLL_KEY_MAX+1]; 1439 1440 static Dtdisc_t disc; 1441 1442 static const char primary[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; 1443 1444 if (!(dt = (Dt_t*)LCINFO(AST_LC_COLLATE)->data)) 1445 { 1446 disc.key = offsetof(Cchr_t, key); 1447 if ((cc = newof(0, Cchr_t, elementsof(primary), 0)) && (dt = dtopen(&disc, Dtoset))) 1448 { 1449 for (i = 0; i < elementsof(primary) - 1; i++, cc++) 1450 { 1451 cc->nam[0] = primary[i]; 1452 mbxfrm(cc->key, cc->nam, COLL_KEY_MAX); 1453 dtinsert(dt, cc); 1454 } 1455 for (i = 0; i < elementsof(cc->key); i++) 1456 cc->key[i] = ~0; 1457 dtinsert(dt, cc); 1458 LCINFO(AST_LC_COLLATE)->data = (void*)dt; 1459 } 1460 else 1461 { 1462 if (cc) 1463 free(cc); 1464 drop(env->disc, e); 1465 return 0; 1466 } 1467 } 1468 if (dt) 1469 { 1470 drop(env->disc, e); 1471 if (ic = env->flags & REG_ICASE) 1472 elements *= 2; 1473 if (!(e = node(env, REX_COLL_CLASS, 1, 1, (elements + 3) * sizeof(Celt_t)))) 1474 return 0; 1475 ce = (Celt_t*)e->re.data; 1476 e->re.collate.invert = neg; 1477 e->re.collate.elements = ce; 1478 env->cursor = first; 1479 inrange = 0; 1480 for (;;) 1481 { 1482 if ((c = *env->cursor) == 0 || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter) 1483 goto error; 1484 pp = env->cursor; 1485 env->cursor += (w = MBSIZE(env->cursor)); 1486 if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))) 1487 { 1488 if (*env->cursor) 1489 { 1490 if (*env->cursor == 'n') 1491 { 1492 pp = env->cursor++; 1493 c = '\n'; 1494 } 1495 else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED)) 1496 { 1497 env->token.len = 1; 1498 w = magic(env, *env->cursor, 2); 1499 if (env->token.len > 1 || w != T_BAD) 1500 { 1501 if (env->token.len == 1 && (f = classfun(w))) 1502 { 1503 if (inrange > 1) 1504 { 1505 if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP))) 1506 goto erange; 1507 inrange = 0; 1508 } 1509 env->cursor++; 1510 ce->fun = f; 1511 ce->typ = COLL_call; 1512 ce++; 1513 continue; 1514 } 1515 if (env->token.len > 1 || w >= 0 && w < T_META) 1516 { 1517 c = w; 1518 w = mbconv(mbc, c); 1519 pp = (unsigned char*)mbc; 1520 env->cursor += env->token.len; 1521 } 1522 } 1523 } 1524 } 1525 } 1526 else if (c == ']') 1527 { 1528 if (env->cursor == begin) 1529 { 1530 rp = pp; 1531 rw = w; 1532 inrange = 1; 1533 continue; 1534 } 1535 if (inrange != 0) 1536 { 1537 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1538 if (inrange == 2) 1539 ce = col(ce, ic, NiL, 1, '-', NiL, 0, 0); 1540 } 1541 break; 1542 } 1543 else if (c == '-') 1544 { 1545 if (!inrange && env->cursor != begin && *env->cursor != ']') 1546 { 1547 if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP))) 1548 goto erange; 1549 continue; 1550 } 1551 else if (inrange == 1) 1552 { 1553 inrange = 2; 1554 continue; 1555 } 1556 } 1557 else if (c == '[') 1558 { 1559 switch (*env->cursor) 1560 { 1561 case 0: 1562 goto error; 1563 case ':': 1564 if (env->flags & REG_REGEXP) 1565 goto complicated_normal; 1566 if (inrange == 1) 1567 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1568 if (!(f = regclass((char*)env->cursor, (char**)&env->cursor))) 1569 { 1570 if (env->cursor == start && (c = *(env->cursor + 1)) && *(env->cursor + 2) == ':' && *(env->cursor + 3) == ']' && *(env->cursor + 4) == ']') 1571 { 1572 switch (c) 1573 { 1574 case '<': 1575 i = REX_WBEG; 1576 break; 1577 case '>': 1578 i = REX_WEND; 1579 break; 1580 default: 1581 i = 0; 1582 break; 1583 } 1584 if (i) 1585 { 1586 env->cursor += 5; 1587 drop(env->disc, e); 1588 return node(env, i, 0, 0, 0); 1589 } 1590 } 1591 env->error = REG_ECTYPE; 1592 goto error; 1593 } 1594 ce->fun = f; 1595 ce->typ = COLL_call; 1596 ce++; 1597 inrange = 0; 1598 continue; 1599 case '=': 1600 if (env->flags & REG_REGEXP) 1601 goto complicated_normal; 1602 if (inrange == 2) 1603 goto erange; 1604 if (inrange == 1) 1605 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1606 pp = (unsigned char*)cb[inrange]; 1607 rp = env->cursor + 1; 1608 if ((rw = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX, &wc)) < 0) 1609 goto ecollate; 1610 c = 0; 1611 if (ic) 1612 { 1613 if (iswupper(wc)) 1614 { 1615 wc = towlower(wc); 1616 rw = mbconv((char*)pp, wc); 1617 c = 'u'; 1618 } 1619 else if (iswlower(wc)) 1620 c = 'l'; 1621 } 1622 i = 1; 1623 for (;;) 1624 { 1625 mbxfrm(key.key, (char*)pp, COLL_KEY_MAX); 1626 if (!(cc = (Cchr_t*)dtsearch(dt, &key)) && !(cc = (Cchr_t*)dtprev(dt, &key))) 1627 { 1628 if (i) 1629 { 1630 c = *pp; 1631 goto singleton; 1632 } 1633 goto ecollate; 1634 } 1635 xc = (tc = (Cchr_t*)dtprev(dt, cc)) && !strcasecmp((char*)tc->nam, (char*)cc->nam) ? tc : cc; 1636 if (c == 'l' || c == 'L' && !(c = 0)) 1637 ce->typ = COLL_range_lc; 1638 else if (c == 'u' || c == 'U' && !(c = 0)) 1639 ce->typ = COLL_range_uc; 1640 else 1641 ce->typ = COLL_range; 1642 strcpy((char*)ce->beg, (char*)xc->key); 1643 if (!(cc = (Cchr_t*)dtnext(dt, cc))) 1644 { 1645 if (i) 1646 { 1647 c = *pp; 1648 goto singleton; 1649 } 1650 goto ecollate; 1651 } 1652 if (!strcasecmp((char*)xc->nam, (char*)cc->nam) && (tc = (Cchr_t*)dtnext(dt, cc))) 1653 cc = tc; 1654 strcpy((char*)ce->end, (char*)cc->key); 1655 ce->max = -1; 1656 ce++; 1657 if (!c) 1658 break; 1659 if (c == 'u') 1660 { 1661 wc = towlower(wc); 1662 c = 'L'; 1663 } 1664 else 1665 { 1666 wc = towupper(wc); 1667 c = 'U'; 1668 } 1669 rw = mbconv((char*)pp, wc); 1670 i = 0; 1671 } 1672 inrange = 0; 1673 c = *pp; 1674 continue; 1675 case '.': 1676 if (env->flags & REG_REGEXP) 1677 goto complicated_normal; 1678 pp = (unsigned char*)cb[inrange]; 1679 if ((w = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX, NiL)) < 0) 1680 goto ecollate; 1681 c = *pp; 1682 break; 1683 default: 1684 complicated_normal: 1685 if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)) 1686 goto error; 1687 break; 1688 } 1689 } 1690 singleton: 1691 if (inrange == 2) 1692 { 1693 ce = col(ce, ic, rp, rw, rc, pp, w, c); 1694 if (strcmp((char*)ce->beg, (char*)ce->end) > 0) 1695 { 1696 if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP))) 1697 goto erange; 1698 (ce-1)->typ = COLL_char; 1699 strcpy((char*)ce->beg, (char*)(ce-1)->end); 1700 ce->typ = COLL_char; 1701 ce++; 1702 } 1703 inrange = env->type >= SRE || (env->flags & (REG_LENIENT|REG_REGEXP)); 1704 } 1705 else if (inrange == 1) 1706 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0); 1707 else 1708 inrange = 1; 1709 rp = pp; 1710 rw = w; 1711 rc = c; 1712 } 1713 ce->typ = COLL_end; 1714 return e; 1715 } 1716 } 1717 #endif 1718 if (collate) 1719 goto ecollate; 1720 if (env->flags & REG_ICASE) 1721 for (i = 0; i <= UCHAR_MAX; i++) 1722 if (settst(e->re.charclass, i)) 1723 { 1724 if (isupper(i)) 1725 c = tolower(i); 1726 else if (islower(i)) 1727 c = toupper(i); 1728 else 1729 continue; 1730 setadd(e->re.charclass, c); 1731 } 1732 if (neg) 1733 { 1734 for (i = 0; i < elementsof(e->re.charclass->bits); i++) 1735 e->re.charclass->bits[i] ^= ~0; 1736 if (env->explicit >= 0) 1737 setclr(e->re.charclass, env->explicit); 1738 } 1739 return e; 1740 ecollate: 1741 env->error = REG_ECOLLATE; 1742 goto error; 1743 erange: 1744 env->error = REG_ERANGE; 1745 error: 1746 drop(env->disc, e); 1747 if (!env->error) 1748 env->error = REG_EBRACK; 1749 return 0; 1750 } 1751 1752 static Rex_t* 1753 ccl(Cenv_t* env, int type) 1754 { 1755 int i; 1756 Rex_t* e; 1757 Celt_t* ce; 1758 regclass_t f; 1759 1760 if (!(f = classfun(type))) 1761 { 1762 env->error = REG_BADESC; 1763 return 0; 1764 } 1765 if (!mbcoll()) 1766 { 1767 if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t)))) 1768 return 0; 1769 for (i = 0; i <= UCHAR_MAX; i++) 1770 if ((*f)(i)) 1771 setadd(e->re.charclass, i); 1772 if (env->explicit >= 0) 1773 setclr(e->re.charclass, env->explicit); 1774 } 1775 else 1776 { 1777 if (!(e = node(env, REX_COLL_CLASS, 1, 1, 2 * sizeof(Celt_t)))) 1778 return 0; 1779 ce = (Celt_t*)e->re.data; 1780 e->re.collate.invert = 0; 1781 e->re.collate.elements = ce; 1782 ce->fun = f; 1783 ce->typ = COLL_call; 1784 ce++; 1785 ce->typ = COLL_end; 1786 } 1787 return e; 1788 } 1789 1790 static Rex_t* 1791 rep(Cenv_t* env, Rex_t* e, int number, int last) 1792 { 1793 Rex_t* f; 1794 unsigned long m = 0; 1795 unsigned long n = RE_DUP_INF; 1796 int minimal = -1; 1797 1798 if (!e) 1799 return 0; 1800 switch (token(env)) 1801 { 1802 case T_BANG: 1803 eat(env); 1804 if (!(f = node(env, REX_NEG, m, n, 0))) 1805 { 1806 drop(env->disc, e); 1807 return 0; 1808 } 1809 f->re.group.expr.rex = e; 1810 return f; 1811 case T_QUES: 1812 eat(env); 1813 n = 1; 1814 break; 1815 case T_STAR: 1816 eat(env); 1817 break; 1818 case T_PLUS: 1819 eat(env); 1820 m = 1; 1821 break; 1822 case T_LEFT: 1823 eat(env); 1824 m = env->token.min; 1825 n = env->token.max; 1826 break; 1827 default: 1828 return e; 1829 } 1830 if (env->token.att) 1831 minimal = 1; 1832 else if (env->type < SRE) 1833 switch (token(env)) 1834 { 1835 case T_QUES: 1836 eat(env); 1837 minimal = !(env->flags & REG_MINIMAL); 1838 break; 1839 case T_STAR: /*AST*/ 1840 eat(env); 1841 minimal = !!(env->flags & REG_MINIMAL); 1842 break; 1843 } 1844 switch (e->type) 1845 { 1846 case REX_DOT: 1847 case REX_CLASS: 1848 case REX_COLL_CLASS: 1849 case REX_ONECHAR: 1850 e->lo = m; 1851 e->hi = n; 1852 if (minimal >= 0) 1853 mark(e, minimal); 1854 return e; 1855 #if HUH_2002_08_07 1856 case REX_BEG: 1857 #endif 1858 case REX_BEG_STR: 1859 case REX_END_STR: 1860 case REX_FIN_STR: 1861 case REX_WBEG: 1862 case REX_WEND: 1863 case REX_WORD: 1864 case REX_WORD_NOT: 1865 env->error = REG_BADRPT; 1866 drop(env->disc, e); 1867 return 0; 1868 } 1869 if (m == 1 && n == 1) 1870 { 1871 if (minimal >= 0) 1872 mark(e, minimal); 1873 return e; 1874 } 1875 if (!(f = node(env, REX_REP, m, n, 0))) 1876 { 1877 drop(env->disc, e); 1878 return 0; 1879 } 1880 f->re.group.expr.rex = e; 1881 f->re.group.number = number; 1882 f->re.group.last = last; 1883 if (minimal >= 0) 1884 mark(f, minimal); 1885 if (m <= n && n) 1886 { 1887 for (; e && e->type >= REX_GROUP && e->type <= REX_GROUP_CUT; e = e->re.group.expr.rex); 1888 if (e && e->type == REX_NEG) 1889 f->type = REX_GROUP; 1890 } 1891 return f; 1892 } 1893 1894 static int 1895 isstring(Rex_t* e) 1896 { 1897 switch (e->type) 1898 { 1899 case REX_ONECHAR: 1900 return e->lo == 1 && e->hi == 1; 1901 case REX_STRING: 1902 return 1; 1903 } 1904 return 0; 1905 } 1906 1907 static Trie_node_t* 1908 trienode(Cenv_t* env, int c) 1909 { 1910 Trie_node_t* t; 1911 1912 if (t = (Trie_node_t*)alloc(env->disc, 0, sizeof(Trie_node_t))) 1913 { 1914 memset(t, 0, sizeof(Trie_node_t)); 1915 t->c = c; 1916 } 1917 return t; 1918 } 1919 1920 static int 1921 insert(Cenv_t* env, Rex_t* f, Rex_t* g) 1922 { 1923 unsigned char* s; 1924 unsigned char* e; 1925 Trie_node_t* t; 1926 int len; 1927 unsigned char tmp[2]; 1928 1929 switch (f->type) 1930 { 1931 case REX_ONECHAR: 1932 *(s = tmp) = f->re.onechar; 1933 e = s + 1; 1934 break; 1935 case REX_STRING: 1936 s = f->re.string.base; 1937 e = s + f->re.string.size; 1938 break; 1939 default: 1940 return 1; 1941 } 1942 if (!(t = g->re.trie.root[*s]) && !(t = g->re.trie.root[*s] = trienode(env, *s))) 1943 return 1; 1944 for (len = 1;;) 1945 { 1946 if (t->c == *s) 1947 { 1948 if (++s >= e) 1949 break; 1950 if (!t->son && !(t->son = trienode(env, *s))) 1951 return 1; 1952 t = t->son; 1953 len++; 1954 } 1955 else 1956 { 1957 if (!t->sib && !(t->sib = trienode(env, *s))) 1958 return 1; 1959 t = t->sib; 1960 } 1961 } 1962 if (g->re.trie.min > len) 1963 g->re.trie.min = len; 1964 if (g->re.trie.max < len) 1965 g->re.trie.max = len; 1966 t->end = 1; 1967 return 0; 1968 } 1969 1970 /* 1971 * trie() tries to combine nontrivial e and f into a REX_TRIE 1972 * unless 0 is returned, e and f are deleted as far as possible 1973 * also called by regcomb 1974 */ 1975 1976 static Rex_t* 1977 trie(Cenv_t* env, Rex_t* e, Rex_t* f) 1978 { 1979 Rex_t* g; 1980 1981 if (e->next || f->next || !isstring(e) || e->flags != f->flags) 1982 return 0; 1983 if (isstring(f)) 1984 { 1985 if (!(g = node(env, REX_TRIE, 0, 0, (UCHAR_MAX + 1) * sizeof(Trie_node_t*)))) 1986 return 0; 1987 g->re.trie.min = INT_MAX; 1988 if (insert(env, f, g)) 1989 goto nospace; 1990 drop(env->disc, f); 1991 } 1992 else if (f->type != REX_TRIE) 1993 return 0; 1994 else 1995 g = f; 1996 if (insert(env, e, g)) 1997 goto nospace; 1998 drop(env->disc, e); 1999 return g; 2000 nospace: 2001 if (g != f) 2002 drop(env->disc, g); 2003 return 0; 2004 } 2005 2006 static Rex_t* alt(Cenv_t*, int, int); 2007 2008 static int 2009 chr(register Cenv_t* env, int* escaped) 2010 { 2011 unsigned char* p; 2012 int c; 2013 2014 *escaped = 0; 2015 if (!(c = *env->cursor)) 2016 return -1; 2017 env->cursor++; 2018 if (c == '\\') 2019 { 2020 if (env->flags & REG_SHELL_ESCAPED) 2021 return c; 2022 if (!(c = *(env->cursor + 1)) || c == env->terminator) 2023 { 2024 if (env->flags & (REG_LENIENT|REG_REGEXP)) 2025 return c ? c : '\\'; 2026 env->error = REG_EESCAPE; 2027 return -1; 2028 } 2029 p = env->cursor; 2030 c = chresc((char*)env->cursor - 1, (char**)&env->cursor); 2031 *escaped = env->cursor - p; 2032 } 2033 return c; 2034 } 2035 2036 /* 2037 * open the perly gates 2038 */ 2039 2040 static Rex_t* 2041 grp(Cenv_t* env, int parno) 2042 { 2043 Rex_t* e; 2044 Rex_t* f; 2045 int c; 2046 int g; 2047 int i; 2048 int n; 2049 int x; 2050 int esc; 2051 int typ; 2052 int beg; 2053 unsigned char* p; 2054 2055 g = env->flags; 2056 beg = env->pattern == env->cursor - env->token.len; 2057 if (!(c = env->token.lex) && (c = *env->cursor)) 2058 env->cursor++; 2059 env->token.len = 0; 2060 env->parnest++; 2061 typ = -1; 2062 switch (c) 2063 { 2064 case '-': 2065 case '+': 2066 case 'a': 2067 case 'g': 2068 case 'i': 2069 case 'l': 2070 case 'm': 2071 case 'p': 2072 case 'r': 2073 case 's': 2074 case 'x': 2075 case 'A': 2076 case 'B': 2077 case 'E': 2078 case 'F': 2079 case 'G': 2080 case 'I': 2081 case 'K': 2082 case 'L': 2083 case 'M': /* glob(3) */ 2084 case 'N': /* glob(3) */ 2085 case 'O': /* glob(3) */ 2086 case 'P': 2087 case 'R': /* pcre */ 2088 case 'S': 2089 case 'U': /* pcre */ 2090 case 'V': 2091 case 'X': /* pcre */ 2092 x = REX_GROUP; 2093 i = 1; 2094 env->token.push = 1; 2095 for (;;) 2096 { 2097 switch (c) 2098 { 2099 case ')': 2100 if (!(env->flags & REG_LITERAL)) 2101 { 2102 env->error = REG_BADRPT; 2103 return 0; 2104 } 2105 /*FALLTHROUGH*/ 2106 case 0: 2107 case T_CLOSE: 2108 x = 0; 2109 goto done; 2110 case ':': 2111 eat(env); 2112 if (token(env) == T_CLOSE) 2113 x = 0; 2114 goto done; 2115 case '-': 2116 i = 0; 2117 break; 2118 case '+': 2119 i = 1; 2120 break; 2121 case 'a': 2122 if (i) 2123 env->flags |= (REG_LEFT|REG_RIGHT); 2124 else 2125 env->flags &= ~(REG_LEFT|REG_RIGHT); 2126 break; 2127 case 'g': 2128 if (i) 2129 env->flags &= ~REG_MINIMAL; 2130 else 2131 env->flags |= REG_MINIMAL; 2132 break; 2133 case 'i': 2134 if (i) 2135 env->flags |= REG_ICASE; 2136 else 2137 env->flags &= ~REG_ICASE; 2138 break; 2139 case 'l': 2140 if (i) 2141 env->flags |= REG_LEFT; 2142 else 2143 env->flags &= ~REG_LEFT; 2144 break; 2145 case 'm': 2146 if (i) 2147 env->flags |= REG_NEWLINE; 2148 else 2149 env->flags &= ~REG_NEWLINE; 2150 env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; 2151 break; 2152 case 'p': 2153 if (i) 2154 env->flags &= ~REG_LENIENT; 2155 else 2156 env->flags |= REG_LENIENT; 2157 break; 2158 case 'r': 2159 if (i) 2160 env->flags |= REG_RIGHT; 2161 else 2162 env->flags &= ~REG_RIGHT; 2163 break; 2164 case 's': 2165 if (i) 2166 env->flags |= REG_SPAN; 2167 else 2168 env->flags &= ~REG_SPAN; 2169 env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1; 2170 break; 2171 case 'x': 2172 if (i) 2173 env->flags |= REG_COMMENT; 2174 else 2175 env->flags &= ~REG_COMMENT; 2176 break; 2177 case 'X': 2178 if (typ >= 0 || env->type == ERE && (env->flags & REG_CLASS_ESCAPE)) 2179 break; /* PCRE_EXTRA */ 2180 /*FALLTHROUGH*/ 2181 case 'A': 2182 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT); 2183 env->flags |= REG_AUGMENTED|REG_EXTENDED; 2184 typ = ARE; 2185 break; 2186 case 'B': 2187 case 'G': 2188 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT); 2189 typ = BRE; 2190 break; 2191 case 'E': 2192 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT); 2193 env->flags |= REG_EXTENDED; 2194 typ = ERE; 2195 break; 2196 case 'F': 2197 case 'L': 2198 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT); 2199 env->flags |= REG_LITERAL; 2200 typ = ERE; 2201 break; 2202 case 'K': 2203 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT); 2204 env->flags |= REG_AUGMENTED|REG_SHELL|REG_LEFT|REG_RIGHT; 2205 typ = KRE; 2206 break; 2207 case 'M': 2208 /* used by caller to disable glob(3) GLOB_BRACE */ 2209 break; 2210 case 'N': 2211 /* used by caller to disable glob(3) GLOB_NOCHECK */ 2212 break; 2213 case 'O': 2214 /* used by caller to disable glob(3) GLOB_STARSTAR */ 2215 break; 2216 case 'P': 2217 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT); 2218 env->flags |= REG_EXTENDED|REG_CLASS_ESCAPE; 2219 typ = ERE; 2220 break; 2221 case 'S': 2222 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT); 2223 env->flags |= REG_SHELL|REG_LEFT|REG_RIGHT; 2224 typ = SRE; 2225 break; 2226 case 'U': /* PCRE_UNGREEDY */ 2227 if (typ >= 0 || env->type == ERE && (env->flags & REG_CLASS_ESCAPE)) 2228 { 2229 if (i) 2230 env->flags |= REG_MINIMAL; 2231 else 2232 env->flags &= ~REG_MINIMAL; 2233 } 2234 break; 2235 case 'V': 2236 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT); 2237 env->flags |= REG_REGEXP; 2238 typ = BRE; 2239 break; 2240 default: 2241 env->error = REG_BADRPT; 2242 return 0; 2243 } 2244 eat(env); 2245 c = token(env); 2246 } 2247 done: 2248 break; 2249 case ':': 2250 x = REX_GROUP; 2251 break; 2252 case '=': 2253 x = REX_GROUP_AHEAD; 2254 break; 2255 case '!': 2256 x = REX_GROUP_AHEAD_NOT; 2257 break; 2258 case '<': 2259 switch (token(env)) 2260 { 2261 case '=': 2262 x = REX_GROUP_BEHIND; 2263 break; 2264 case '!': 2265 case T_BANG: 2266 x = REX_GROUP_BEHIND_NOT; 2267 break; 2268 default: 2269 env->error = REG_BADRPT; 2270 return 0; 2271 } 2272 eat(env); 2273 break; 2274 case '>': 2275 x = REX_GROUP_CUT; 2276 break; 2277 case '%': 2278 case T_PERCENT: 2279 e = node(env, REX_NEST, 0, 0, (UCHAR_MAX + 1) * sizeof(unsigned short)); 2280 e->re.nest.primary = isalnum(*env->cursor) ? -1 : *env->cursor; 2281 n = 1; 2282 for (;;) 2283 { 2284 switch (i = chr(env, &esc)) 2285 { 2286 case -1: 2287 case 0: 2288 invalid: 2289 env->cursor -= esc + 1; 2290 env->error = REG_EPAREN; 2291 return 0; 2292 case 'D': 2293 x = REX_NEST_delimiter; 2294 /*FALLTHROUGH*/ 2295 delimiter: 2296 if ((i = chr(env, &esc)) < 0) 2297 goto invalid; 2298 if (e->re.nest.type[i] & ~x) 2299 goto invalid; 2300 e->re.nest.type[i] = x; 2301 continue; 2302 case 'E': 2303 x = REX_NEST_escape; 2304 goto delimiter; 2305 case 'L': 2306 x = REX_NEST_literal; 2307 goto quote; 2308 case 'O': 2309 switch (i = chr(env, &esc)) 2310 { 2311 case 'T': 2312 e->re.nest.type[UCHAR_MAX+1] |= REX_NEST_terminator; 2313 break; 2314 default: 2315 goto invalid; 2316 } 2317 continue; 2318 case 'Q': 2319 x = REX_NEST_quote; 2320 /*FALLTHROUGH*/ 2321 quote: 2322 if ((i = chr(env, &esc)) < 0) 2323 goto invalid; 2324 if (e->re.nest.type[i] & ~x) 2325 goto invalid; 2326 e->re.nest.type[i] = x|REX_NEST_open|REX_NEST_close|(i<<REX_NEST_SHIFT); 2327 continue; 2328 case 'S': 2329 x = REX_NEST_separator; 2330 goto delimiter; 2331 case 'T': 2332 x = REX_NEST_terminator; 2333 goto delimiter; 2334 case '|': 2335 case '&': 2336 if (!esc) 2337 goto invalid; 2338 goto nesting; 2339 case '(': 2340 if (!esc) 2341 n++; 2342 goto nesting; 2343 case ')': 2344 if (!esc && !--n) 2345 break; 2346 goto nesting; 2347 default: 2348 nesting: 2349 if (isalnum(i) || (e->re.nest.type[i] & (REX_NEST_close|REX_NEST_escape|REX_NEST_literal|REX_NEST_quote|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) 2350 goto invalid; 2351 e->re.nest.type[i] = REX_NEST_open; 2352 if ((x = chr(env, &esc)) < 0 || (e->re.nest.type[x] & (REX_NEST_close|REX_NEST_escape|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))) 2353 goto invalid; 2354 if (!esc) 2355 { 2356 if (x == ')' && !--n) 2357 goto invalid; 2358 else if (x == '(') 2359 n++; 2360 } 2361 e->re.nest.type[x] |= REX_NEST_close; 2362 e->re.nest.type[i] |= x << REX_NEST_SHIFT; 2363 continue; 2364 } 2365 break; 2366 } 2367 env->parnest--; 2368 if (c == T_PERCENT) 2369 for (n = 0; n < 2; n++) 2370 { 2371 parno = ++env->parno; 2372 if (!(f = node(env, REX_GROUP, 0, 0, 0))) 2373 { 2374 drop(env->disc, e); 2375 return 0; 2376 } 2377 if (parno < elementsof(env->paren)) 2378 env->paren[parno] = f; 2379 f->re.group.back = 0; 2380 f->re.group.number = parno; 2381 f->re.group.expr.rex = e; 2382 e = f; 2383 } 2384 return e; 2385 case '(': 2386 c = 0; 2387 if (isdigit(*env->cursor)) 2388 { 2389 f = 0; 2390 do 2391 { 2392 if (c > (INT_MAX / 10)) 2393 { 2394 env->error = REG_BADRPT; 2395 return 0; 2396 } 2397 c = c * 10 + (*env->cursor++ - '0'); 2398 } while (isdigit(*env->cursor)); 2399 if (*env->cursor++ != ')') 2400 { 2401 env->error = REG_BADRPT; 2402 return 0; 2403 } 2404 if (c && env->type >= SRE) 2405 c = c * 2 - 1; 2406 if (!c || c > env->parno || !env->paren[c]) 2407 { 2408 if (!(env->flags & (REG_LENIENT|REG_REGEXP))) 2409 { 2410 env->error = REG_ESUBREG; 2411 return 0; 2412 } 2413 if (c) 2414 c = -1; 2415 } 2416 } 2417 else 2418 { 2419 if (env->type < SRE && *env->cursor++ != '?') 2420 { 2421 env->error = REG_BADRPT; 2422 return 0; 2423 } 2424 if (!(f = grp(env, parno + 1)) && env->error) 2425 return 0; 2426 } 2427 if (!(e = node(env, REX_GROUP_COND, 0, 0, 0))) 2428 { 2429 drop(env->disc, f); 2430 return 0; 2431 } 2432 e->re.group.size = c; 2433 e->re.group.expr.binary.left = f; 2434 if (!(e->re.group.expr.binary.right = alt(env, parno, 1))) 2435 { 2436 drop(env->disc, e); 2437 return 0; 2438 } 2439 if (token(env) != T_CLOSE) 2440 { 2441 env->error = REG_EPAREN; 2442 return 0; 2443 } 2444 eat(env); 2445 env->parnest--; 2446 return rep(env, e, parno, parno); 2447 case '{': 2448 p = env->cursor; 2449 n = 1; 2450 while (c = *env->cursor) 2451 { 2452 if (c == '\\' && *(env->cursor + 1)) 2453 env->cursor++; 2454 else if (c == '{') 2455 n++; 2456 else if (c == '}' && !--n) 2457 break; 2458 else if (c == env->delimiter || c == env->terminator) 2459 break; 2460 env->cursor++; 2461 } 2462 if (c != '}') 2463 { 2464 env->error = REG_EBRACE; 2465 return 0; 2466 } 2467 if (*++env->cursor != ')') 2468 { 2469 env->error = REG_EPAREN; 2470 return 0; 2471 } 2472 env->cursor++; 2473 env->parnest--; 2474 if (env->disc->re_version < REG_VERSION_EXEC) 2475 { 2476 env->error = REG_BADRPT; 2477 return 0; 2478 } 2479 if (!env->disc->re_execf) 2480 return 0; 2481 if (!(e = node(env, REX_EXEC, 0, 0, 0))) 2482 return 0; 2483 e->re.exec.text = (const char*)p; 2484 e->re.exec.size = env->cursor - p - 2; 2485 if (!env->disc->re_compf) 2486 e->re.exec.data = 0; 2487 else 2488 e->re.exec.data = (*env->disc->re_compf)(env->regex, e->re.exec.text, e->re.exec.size, env->disc); 2489 return e; 2490 case '0': case '1': case '2': case '3': case '4': 2491 case '5': case '6': case '7': case '8': case '9': 2492 c -= '0'; 2493 while (isdigit(*env->cursor)) 2494 { 2495 if (c > (INT_MAX / 10)) 2496 { 2497 env->error = REG_ESUBREG; 2498 return 0; 2499 } 2500 c = c * 10 + *env->cursor++ - '0'; 2501 } 2502 if (*env->cursor == ')') 2503 { 2504 env->cursor++; 2505 env->parnest--; 2506 env->token.len = 1; 2507 if (c > env->parno || !env->paren[c]) 2508 { 2509 env->error = REG_ESUBREG; 2510 return 0; 2511 } 2512 env->paren[c]->re.group.back = 1; 2513 return rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); 2514 } 2515 /*FALLTHROUGH*/ 2516 default: 2517 env->error = REG_BADRPT; 2518 return 0; 2519 } 2520 p = env->pattern; 2521 i = env->type; 2522 if (x) 2523 { 2524 if (typ >= 0) 2525 env->type = typ; 2526 if (!(e = alt(env, parno, 0))) 2527 goto nope; 2528 env->flags = g; 2529 env->type = i; 2530 } 2531 c = token(env); 2532 env->parnest--; 2533 if (c != T_CLOSE && (!(env->flags & REG_LITERAL) || c != ')')) 2534 { 2535 env->error = REG_EPAREN; 2536 goto nope; 2537 } 2538 eat(env); 2539 if (typ >= 0 && beg) 2540 env->pattern = env->cursor; 2541 if (!x) 2542 { 2543 if (typ >= 0) 2544 env->type = typ; 2545 return 0; 2546 } 2547 if (!(f = node(env, x, 0, 0, 0))) 2548 { 2549 drop(env->disc, e); 2550 goto nope; 2551 } 2552 f->re.group.expr.rex = e; 2553 if (x == REX_GROUP_BEHIND || x == REX_GROUP_BEHIND_NOT) 2554 { 2555 if (stats(env, e)) 2556 { 2557 drop(env->disc, f); 2558 if (!env->error) 2559 env->error = REG_ECOUNT; 2560 goto nope; 2561 } 2562 f->re.group.size = env->stats.m; 2563 memset(&env->stats, 0, sizeof(env->stats)); 2564 } 2565 switch (x) 2566 { 2567 case REX_GROUP: 2568 case REX_GROUP_CUT: 2569 f = rep(env, f, parno, env->parno); 2570 break; 2571 } 2572 if (f) 2573 return f; 2574 nope: 2575 env->flags = g; 2576 env->pattern = p; 2577 env->type = i; 2578 return 0; 2579 } 2580 2581 static Rex_t* 2582 seq(Cenv_t* env) 2583 { 2584 Rex_t* e; 2585 Rex_t* f; 2586 Token_t tok; 2587 int c; 2588 int i; 2589 int n; 2590 int x; 2591 int parno; 2592 int type; 2593 regflags_t flags; 2594 unsigned char* s; 2595 unsigned char* p; 2596 unsigned char* t; 2597 unsigned char* u; 2598 unsigned char buf[256]; 2599 2600 for (;;) 2601 { 2602 s = buf; 2603 while ((c = token(env)) < T_META && s < &buf[sizeof(buf) - env->token.len]) 2604 { 2605 x = c; 2606 p = env->cursor; 2607 if (c >= 0) 2608 { 2609 n = 1; 2610 *s++ = (env->flags & REG_ICASE) ? toupper(c) : c; 2611 } 2612 else if (c == C_ESC || (env->flags & REG_ICASE)) 2613 { 2614 c = (c == C_ESC) ? env->token.lex : mbchar(p); 2615 if (env->flags & REG_ICASE) 2616 c = towupper(c); 2617 if ((&buf[sizeof(buf)] - s) < MB_CUR_MAX) 2618 break; 2619 if ((n = mbconv((char*)s, c)) < 0) 2620 *s++ = c; 2621 else if (n) 2622 s += n; 2623 else 2624 { 2625 n = 1; 2626 *s++ = 0; 2627 } 2628 } 2629 else 2630 { 2631 n = env->token.len - env->token.esc; 2632 for (t = p, u = s + n; s < u; *s++ = *t++); 2633 } 2634 eat(env); 2635 } 2636 if (c == T_BAD) 2637 return 0; 2638 if (s > buf) 2639 switch (c) 2640 { 2641 case T_STAR: 2642 case T_PLUS: 2643 case T_LEFT: 2644 case T_QUES: 2645 case T_BANG: 2646 if ((s -= n) == buf) 2647 e = 0; 2648 else 2649 { 2650 i = s - buf; 2651 if (!(e = node(env, REX_STRING, 0, 0, i))) 2652 return 0; 2653 memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, i); 2654 e->re.string.size = i; 2655 } 2656 if (x >= 0) 2657 { 2658 if (!(f = node(env, REX_ONECHAR, 1, 1, 0))) 2659 { 2660 drop(env->disc, e); 2661 return 0; 2662 } 2663 f->re.onechar = (env->flags & REG_ICASE) ? toupper(x) : x; 2664 } 2665 else 2666 { 2667 if (!(f = node(env, REX_STRING, 0, 0, n))) 2668 return 0; 2669 memcpy((char*)(f->re.string.base = (unsigned char*)f->re.data), (char*)p, n); 2670 f->re.string.size = n; 2671 } 2672 if (!(f = rep(env, f, 0, 0)) || !(f = cat(env, f, seq(env)))) 2673 { 2674 drop(env->disc, e); 2675 return 0; 2676 } 2677 if (e) 2678 f = cat(env, e, f); 2679 return f; 2680 default: 2681 c = s - buf; 2682 if (!(e = node(env, REX_STRING, 0, 0, c))) 2683 return 0; 2684 memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, c); 2685 e->re.string.size = c; 2686 return cat(env, e, seq(env)); 2687 } 2688 else if (c > T_BACK) 2689 { 2690 eat(env); 2691 c -= T_BACK; 2692 if (c > env->parno || !env->paren[c]) 2693 { 2694 env->error = REG_ESUBREG; 2695 return 0; 2696 } 2697 env->paren[c]->re.group.back = 1; 2698 e = rep(env, node(env, REX_BACK, c, 0, 0), 0, 0); 2699 } 2700 else 2701 switch (c) 2702 { 2703 case T_AND: 2704 case T_CLOSE: 2705 case T_BAR: 2706 case T_END: 2707 return node(env, REX_NULL, 0, 0, 0); 2708 case T_DOLL: 2709 eat(env); 2710 e = rep(env, node(env, REX_END, 0, 0, 0), 0, 0); 2711 break; 2712 case T_CFLX: 2713 eat(env); 2714 if ((e = node(env, REX_BEG, 0, 0, 0)) && (env->flags & REG_EXTENDED)) 2715 e = rep(env, e, 0, 0); 2716 break; 2717 case T_OPEN: 2718 tok = env->token; 2719 eat(env); 2720 flags = env->flags; 2721 type = env->type; 2722 if (env->token.att) 2723 env->flags |= REG_MINIMAL; 2724 env->parnest++; 2725 if (env->type == KRE) 2726 ++env->parno; 2727 parno = ++env->parno; 2728 if (!(e = alt(env, parno + 1, 0))) 2729 break; 2730 if (e->type == REX_NULL && env->type == ERE && !(env->flags & (REG_NULL|REG_REGEXP))) 2731 { 2732 drop(env->disc, e); 2733 env->error = (*env->cursor == 0 || *env->cursor == env->delimiter || *env->cursor == env->terminator) ? REG_EPAREN : REG_ENULL; 2734 return 0; 2735 } 2736 if (token(env) != T_CLOSE) 2737 { 2738 drop(env->disc, e); 2739 env->error = REG_EPAREN; 2740 return 0; 2741 } 2742 env->parnest--; 2743 eat(env); 2744 if (!(f = node(env, REX_GROUP, 0, 0, 0))) 2745 { 2746 drop(env->disc, e); 2747 return 0; 2748 } 2749 if (parno < elementsof(env->paren)) 2750 env->paren[parno] = f; 2751 f->re.group.back = 0; 2752 f->re.group.number = parno; 2753 f->re.group.expr.rex = e; 2754 if (tok.lex) 2755 { 2756 tok.push = 1; 2757 env->token = tok; 2758 } 2759 if (!(e = rep(env, f, parno, env->parno))) 2760 return 0; 2761 if (env->type == KRE) 2762 { 2763 if (!(f = node(env, REX_GROUP, 0, 0, 0))) 2764 { 2765 drop(env->disc, e); 2766 return 0; 2767 } 2768 if (--parno < elementsof(env->paren)) 2769 env->paren[parno] = f; 2770 f->re.group.back = 0; 2771 f->re.group.number = parno; 2772 f->re.group.expr.rex = e; 2773 e = f; 2774 } 2775 env->flags = flags; 2776 env->type = type; 2777 break; 2778 case T_GROUP: 2779 p = env->cursor; 2780 eat(env); 2781 flags = env->flags; 2782 type = env->type; 2783 if (!(e = grp(env, env->parno + 1))) 2784 { 2785 if (env->error) 2786 return 0; 2787 if (env->literal == env->pattern && env->literal == p) 2788 env->literal = env->cursor; 2789 continue; 2790 } 2791 env->flags = flags; 2792 env->type = type; 2793 break; 2794 case T_BRA: 2795 eat(env); 2796 if (e = bra(env)) 2797 e = rep(env, e, 0, 0); 2798 break; 2799 case T_ALNUM: 2800 case T_ALNUM_NOT: 2801 case T_DIGIT: 2802 case T_DIGIT_NOT: 2803 case T_SPACE: 2804 case T_SPACE_NOT: 2805 eat(env); 2806 if (e = ccl(env, c)) 2807 e = rep(env, e, 0, 0); 2808 break; 2809 case T_LT: 2810 eat(env); 2811 e = rep(env, node(env, REX_WBEG, 0, 0, 0), 0, 0); 2812 break; 2813 case T_GT: 2814 eat(env); 2815 e = rep(env, node(env, REX_WEND, 0, 0, 0), 0, 0); 2816 break; 2817 case T_DOT: 2818 eat(env); 2819 e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); 2820 break; 2821 case T_DOTSTAR: 2822 eat(env); 2823 env->token.lex = T_STAR; 2824 env->token.push = 1; 2825 e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0); 2826 break; 2827 case T_SLASHPLUS: 2828 eat(env); 2829 env->token.lex = T_PLUS; 2830 env->token.push = 1; 2831 if (e = node(env, REX_ONECHAR, 1, 1, 0)) 2832 { 2833 e->re.onechar = '/'; 2834 e = rep(env, e, 0, 0); 2835 } 2836 break; 2837 case T_WORD: 2838 eat(env); 2839 e = rep(env, node(env, REX_WORD, 0, 0, 0), 0, 0); 2840 break; 2841 case T_WORD_NOT: 2842 eat(env); 2843 e = rep(env, node(env, REX_WORD_NOT, 0, 0, 0), 0, 0); 2844 break; 2845 case T_BEG_STR: 2846 eat(env); 2847 e = rep(env, node(env, REX_BEG_STR, 0, 0, 0), 0, 0); 2848 break; 2849 case T_END_STR: 2850 eat(env); 2851 e = rep(env, node(env, REX_END_STR, 0, 0, 0), 0, 0); 2852 break; 2853 case T_FIN_STR: 2854 eat(env); 2855 e = rep(env, node(env, REX_FIN_STR, 0, 0, 0), 0, 0); 2856 break; 2857 default: 2858 env->error = REG_BADRPT; 2859 return 0; 2860 } 2861 if (e && *env->cursor != 0 && *env->cursor != env->delimiter && *env->cursor != env->terminator) 2862 e = cat(env, e, seq(env)); 2863 return e; 2864 } 2865 } 2866 2867 static Rex_t* 2868 con(Cenv_t* env) 2869 { 2870 Rex_t* e; 2871 Rex_t* f; 2872 Rex_t* g; 2873 2874 if (!(e = seq(env)) || !(env->flags & REG_AUGMENTED) || token(env) != T_AND) 2875 return e; 2876 eat(env); 2877 if (!(f = con(env))) 2878 { 2879 drop(env->disc, e); 2880 return 0; 2881 } 2882 if (!(g = node(env, REX_CONJ, 0, 0, 0))) 2883 { 2884 drop(env->disc, e); 2885 drop(env->disc, f); 2886 return 0; 2887 } 2888 g->re.group.expr.binary.left = e; 2889 g->re.group.expr.binary.right = f; 2890 return g; 2891 } 2892 2893 static Rex_t* 2894 alt(Cenv_t* env, int number, int cond) 2895 { 2896 Rex_t* e; 2897 Rex_t* f; 2898 Rex_t* g; 2899 2900 if (!(e = con(env))) 2901 return 0; 2902 else if (token(env) != T_BAR) 2903 { 2904 if (!cond) 2905 return e; 2906 f = 0; 2907 if (e->type == REX_NULL) 2908 goto bad; 2909 } 2910 else 2911 { 2912 eat(env); 2913 if (!(f = alt(env, number, 0))) 2914 { 2915 drop(env->disc, e); 2916 return 0; 2917 } 2918 if ((e->type == REX_NULL || f->type == REX_NULL) && !(env->flags & (REG_NULL|REG_REGEXP))) 2919 goto bad; 2920 if (!cond && (g = trie(env, e, f))) 2921 return g; 2922 } 2923 if (!(g = node(env, REX_ALT, 0, 0, 0))) 2924 { 2925 env->error = REG_ESPACE; 2926 goto bad; 2927 } 2928 g->re.group.number = number; 2929 g->re.group.last = env->parno; 2930 g->re.group.expr.binary.left = e; 2931 g->re.group.expr.binary.right = f; 2932 return g; 2933 bad: 2934 drop(env->disc, e); 2935 drop(env->disc, f); 2936 if (!env->error) 2937 env->error = REG_ENULL; 2938 return 0; 2939 } 2940 2941 /* 2942 * add v to REX_BM tables 2943 */ 2944 2945 static void 2946 bmstr(Cenv_t* env, register Rex_t* a, unsigned char* v, int n, Bm_mask_t b) 2947 { 2948 int c; 2949 int m; 2950 size_t z; 2951 2952 for (m = 0; m < n; m++) 2953 { 2954 if (!(z = n - m - 1)) 2955 z = HIT; 2956 c = v[m]; 2957 a->re.bm.mask[m][c] |= b; 2958 if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) 2959 a->re.bm.skip[c] = z; 2960 if (a->flags & REG_ICASE) 2961 { 2962 if (isupper(c)) 2963 c = tolower(c); 2964 else if (islower(c)) 2965 c = toupper(c); 2966 else 2967 continue; 2968 a->re.bm.mask[m][c] |= b; 2969 if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT) 2970 a->re.bm.skip[c] = z; 2971 } 2972 } 2973 } 2974 2975 /* 2976 * set up BM table from trie 2977 */ 2978 2979 static int 2980 bmtrie(Cenv_t* env, Rex_t* a, unsigned char* v, Trie_node_t* x, int n, int m, Bm_mask_t b) 2981 { 2982 do 2983 { 2984 v[m] = x->c; 2985 if (m >= (n - 1)) 2986 { 2987 bmstr(env, a, v, n, b); 2988 if (!(b <<= 1)) 2989 { 2990 b = 1; 2991 a->re.bm.complete = 0; 2992 } 2993 else if (x->son) 2994 a->re.bm.complete = 0; 2995 } 2996 else if (x->son) 2997 b = bmtrie(env, a, v, x->son, n, m + 1, b); 2998 } while (x = x->sib); 2999 return b; 3000 } 3001 3002 /* 3003 * rewrite the expression tree for some special cases 3004 * 1. it is a null expression - illegal 3005 * 2. max length fixed string found -- use BM algorithm 3006 * 3. it begins with an unanchored string - use KMP algorithm 3007 * 0 returned on success 3008 */ 3009 3010 static int 3011 special(Cenv_t* env, regex_t* p) 3012 { 3013 Rex_t* a; 3014 Rex_t* e; 3015 Rex_t* t; 3016 Rex_t* x; 3017 Rex_t* y; 3018 unsigned char* s; 3019 int* f; 3020 int n; 3021 int m; 3022 int k; 3023 3024 DEBUG_INIT(); 3025 if (e = p->env->rex) 3026 { 3027 if ((x = env->stats.x) && x->re.string.size < 3) 3028 x = 0; 3029 if ((t = env->stats.y) && t->re.trie.min < 3) 3030 t = 0; 3031 if (x && t) 3032 { 3033 if (x->re.string.size >= t->re.trie.min) 3034 t = 0; 3035 else 3036 x = 0; 3037 } 3038 if (x || t) 3039 { 3040 Bm_mask_t** mask; 3041 Bm_mask_t* h; 3042 unsigned char* v; 3043 size_t* q; 3044 size_t l; 3045 int i; 3046 int j; 3047 3048 if (x) 3049 { 3050 y = x; 3051 n = m = x->re.string.size; 3052 l = env->stats.l; 3053 } 3054 else 3055 { 3056 y = t; 3057 n = t->re.trie.min; 3058 m = t->re.trie.max; 3059 l = env->stats.k; 3060 } 3061 if (!(q = (size_t*)alloc(env->disc, 0, (n + 1) * sizeof(size_t)))) 3062 return 1; 3063 if (!(a = node(env, REX_BM, 0, 0, n * (sizeof(Bm_mask_t*) + (UCHAR_MAX + 1) * sizeof(Bm_mask_t)) + (UCHAR_MAX + n + 2) * sizeof(size_t)))) 3064 { 3065 alloc(env->disc, q, 0); 3066 return 1; 3067 } 3068 a->flags = y->flags; 3069 a->map = y->map; 3070 a->re.bm.size = n; 3071 a->re.bm.back = (y == e || y == e->re.group.expr.rex) ? (m - n) : -1; 3072 a->re.bm.left = l - 1; 3073 a->re.bm.right = env->stats.m - l - n; 3074 a->re.bm.complete = (env->stats.e || y != e && (e->type != REX_GROUP || y != e->re.group.expr.rex) || e->next || ((a->re.bm.left + a->re.bm.right) >= 0)) ? 0 : n; 3075 h = (Bm_mask_t*)&a->re.bm.mask[n]; 3076 a->re.bm.skip = (size_t*)(h + n * (UCHAR_MAX + 1)); 3077 a->re.bm.fail = &a->re.bm.skip[UCHAR_MAX + 1]; 3078 for (m = 0; m <= UCHAR_MAX; m++) 3079 a->re.bm.skip[m] = n; 3080 a->re.bm.skip[0] = a->re.bm.skip[env->mappednewline] = (y->next && y->next->type == REX_END) ? HIT : (n + a->re.bm.left); 3081 for (i = 1; i <= n; i++) 3082 a->re.bm.fail[i] = 2 * n - i; 3083 mask = a->re.bm.mask; 3084 for (m = 0; m < n; m++) 3085 { 3086 mask[m] = h; 3087 h += UCHAR_MAX + 1; 3088 } 3089 if (x) 3090 bmstr(env, a, x->re.string.base, n, 1); 3091 else 3092 { 3093 v = (unsigned char*)q; 3094 memset(v, 0, n); 3095 m = 1; 3096 for (i = 0; i <= UCHAR_MAX; i++) 3097 if (t->re.trie.root[i]) 3098 m = bmtrie(env, a, v, t->re.trie.root[i], n, 0, m); 3099 } 3100 mask--; 3101 memset(q, 0, n * sizeof(*q)); 3102 for (k = (j = n) + 1; j > 0; j--, k--) 3103 { 3104 DEBUG_TEST(0x0010,(sfprintf(sfstderr, "BM#0: k=%d j=%d\n", k, j)),(0)); 3105 for (q[j] = k; k <= n; k = q[k]) 3106 { 3107 for (m = 0; m <= UCHAR_MAX; m++) 3108 if (mask[k][m] == mask[j][m]) 3109 { 3110 DEBUG_TEST(0x0010,sfprintf(sfstderr, "CUT1: mask[%d][%c]=mask[%d][%c]\n", k, m, j, m), (0)); 3111 goto cut; 3112 } 3113 DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#2: fail[%d]=%d => %d\n", k, a->re.bm.fail[k], (a->re.bm.fail[k] > n - j) ? (n - j) : a->re.bm.fail[k]), (0)); 3114 if (a->re.bm.fail[k] > n - j) 3115 a->re.bm.fail[k] = n - j; 3116 } 3117 cut: ; 3118 } 3119 for (i = 1; i <= n; i++) 3120 if (a->re.bm.fail[i] > n + k - i) 3121 { 3122 DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#4: fail[%d]=%d => %d\n", i, a->re.bm.fail[i], n + k - i), (0)); 3123 a->re.bm.fail[i] = n + k - i; 3124 } 3125 #if _AST_REGEX_DEBUG 3126 if (DEBUG_TEST(0x0020,(1),(0))) 3127 { 3128 sfprintf(sfstderr, "STAT: complete=%d n=%d k=%d l=%d r=%d y=%d:%d e=%d:%d\n", a->re.bm.complete, n, k, a->re.bm.left, a->re.bm.right, y->type, y->next ? y->next->type : 0, e->type, e->next ? e->next->type : 0); 3129 for (m = 0; m < n; m++) 3130 for (i = 1; i <= UCHAR_MAX; i++) 3131 if (a->re.bm.mask[m][i]) 3132 sfprintf(sfstderr, "MASK: [%d]['%c'] = %032..2u\n", m, i, a->re.bm.mask[m][i]); 3133 for (i = ' '; i <= UCHAR_MAX; i++) 3134 if (a->re.bm.skip[i] >= HIT) 3135 sfprintf(sfstderr, "SKIP: ['%c'] = *\n", i); 3136 else if (a->re.bm.skip[i] > 0 && a->re.bm.skip[i] < n) 3137 sfprintf(sfstderr, "SKIP: ['%c'] = %3d\n", i, a->re.bm.skip[i]); 3138 for (j = 31; j >= 0; j--) 3139 { 3140 sfprintf(sfstderr, " "); 3141 next: 3142 for (m = 0; m < n; m++) 3143 { 3144 for (i = 0040; i < 0177; i++) 3145 if (a->re.bm.mask[m][i] & (1 << j)) 3146 { 3147 sfprintf(sfstderr, " %c", i); 3148 break; 3149 } 3150 if (i >= 0177) 3151 { 3152 if (j > 0) 3153 { 3154 j--; 3155 goto next; 3156 } 3157 sfprintf(sfstderr, " ?"); 3158 } 3159 } 3160 sfprintf(sfstderr, "\n"); 3161 } 3162 sfprintf(sfstderr, "FAIL: "); 3163 for (m = 1; m <= n; m++) 3164 sfprintf(sfstderr, "%3d", a->re.bm.fail[m]); 3165 sfprintf(sfstderr, "\n"); 3166 } 3167 #endif 3168 alloc(env->disc, q, 0); 3169 a->next = e; 3170 p->env->rex = a; 3171 return 0; 3172 } 3173 switch (e->type) 3174 { 3175 case REX_BEG: 3176 if (env->flags & REG_NEWLINE) 3177 return 0; 3178 break; 3179 case REX_GROUP: 3180 if (env->stats.b) 3181 return 0; 3182 e = e->re.group.expr.rex; 3183 if (e->type != REX_DOT) 3184 return 0; 3185 /*FALLTHROUGH*/ 3186 case REX_DOT: 3187 if (e->lo == 0 && e->hi == RE_DUP_INF) 3188 break; 3189 return 0; 3190 case REX_NULL: 3191 if (env->flags & (REG_NULL|REG_REGEXP)) 3192 break; 3193 env->error = REG_ENULL; 3194 return 1; 3195 case REX_STRING: 3196 if ((env->flags & (REG_LEFT|REG_LITERAL|REG_RIGHT)) || e->map) 3197 return 0; 3198 s = e->re.string.base; 3199 n = e->re.string.size; 3200 if (!(a = node(env, REX_KMP, 0, 0, n * (sizeof(int*) + 1)))) 3201 return 1; 3202 a->flags = e->flags; 3203 a->map = e->map; 3204 f = a->re.string.fail; 3205 memcpy((char*)(a->re.string.base = (unsigned char*)&f[n]), (char*)s, n); 3206 s = a->re.string.base; 3207 a->re.string.size = n; 3208 f[0] = m = -1; 3209 for (k = 1; k < n; k++) 3210 { 3211 while (m >= 0 && s[m+1] != s[k]) 3212 m = f[m]; 3213 if (s[m+1] == s[k]) 3214 m++; 3215 f[k] = m; 3216 } 3217 a->next = e->next; 3218 p->env->rex = a; 3219 e->next = 0; 3220 drop(env->disc, e); 3221 break; 3222 default: 3223 return 0; 3224 } 3225 } 3226 p->env->once = 1; 3227 return 0; 3228 } 3229 3230 int 3231 regcomp(regex_t* p, const char* pattern, regflags_t flags) 3232 { 3233 Rex_t* e; 3234 Rex_t* f; 3235 regdisc_t* disc; 3236 unsigned char* fold; 3237 int i; 3238 Cenv_t env; 3239 3240 if (!p) 3241 return REG_BADPAT; 3242 if (flags & REG_DISCIPLINE) 3243 { 3244 flags &= ~REG_DISCIPLINE; 3245 disc = p->re_disc; 3246 } 3247 else 3248 disc = &state.disc; 3249 if (!disc->re_errorlevel) 3250 disc->re_errorlevel = 2; 3251 p->env = 0; 3252 if (!pattern) 3253 return fatal(disc, REG_BADPAT, pattern); 3254 if (!state.initialized) 3255 { 3256 state.initialized = 1; 3257 for (i = 0; i < elementsof(state.escape); i++) 3258 state.magic[state.escape[i].key] = state.escape[i].val; 3259 } 3260 if (!(fold = (unsigned char*)LCINFO(AST_LC_CTYPE)->data)) 3261 { 3262 if (!(fold = newof(0, unsigned char, UCHAR_MAX, 1))) 3263 return fatal(disc, REG_ESPACE, pattern); 3264 for (i = 0; i <= UCHAR_MAX; i++) 3265 fold[i] = toupper(i); 3266 LCINFO(AST_LC_CTYPE)->data = (void*)fold; 3267 } 3268 again: 3269 if (!(p->env = (Env_t*)alloc(disc, 0, sizeof(Env_t)))) 3270 return fatal(disc, REG_ESPACE, pattern); 3271 memset(p->env, 0, sizeof(*p->env)); 3272 memset(&env, 0, sizeof(env)); 3273 env.regex = p; 3274 env.flags = flags; 3275 env.disc = p->env->disc = disc; 3276 if (env.flags & REG_AUGMENTED) 3277 env.flags |= REG_EXTENDED; 3278 env.mappeddot = '.'; 3279 env.mappednewline = '\n'; 3280 env.mappedslash = '/'; 3281 if (disc->re_version >= REG_VERSION_MAP && disc->re_map) 3282 { 3283 env.map = disc->re_map; 3284 env.MAP = p->env->fold; 3285 for (i = 0; i <= UCHAR_MAX; i++) 3286 { 3287 env.MAP[i] = fold[env.map[i]]; 3288 if (env.map[i] == '.') 3289 env.mappeddot = i; 3290 if (env.map[i] == '\n') 3291 env.mappednewline = i; 3292 if (env.map[i] == '/') 3293 env.mappedslash = i; 3294 } 3295 } 3296 else 3297 env.MAP = fold; 3298 env.type = (env.flags & REG_AUGMENTED) ? ARE : (env.flags & REG_EXTENDED) ? ERE : BRE; 3299 env.explicit = -1; 3300 if (env.flags & REG_SHELL) 3301 { 3302 if (env.flags & REG_SHELL_PATH) 3303 env.explicit = env.mappedslash; 3304 if (!(env.flags & REG_SHELL_ESCAPED)) 3305 env.flags |= REG_CLASS_ESCAPE; 3306 env.flags |= REG_LENIENT|REG_NULL; 3307 env.type = env.type == BRE ? SRE : KRE; 3308 } 3309 else 3310 env.flags &= ~(REG_SHELL_DOT|REG_SHELL_ESCAPED|REG_SHELL_GROUP|REG_SHELL_PATH); 3311 if ((env.flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE) 3312 env.explicit = env.mappednewline; 3313 p->env->leading = (env.flags & REG_SHELL_DOT) ? env.mappeddot : -1; 3314 env.posixkludge = !(env.flags & (REG_EXTENDED|REG_SHELL)); 3315 env.token.lex = 0; 3316 env.token.push = 0; 3317 if (env.flags & REG_DELIMITED) 3318 { 3319 switch (env.delimiter = *pattern++) 3320 { 3321 case 0: 3322 case '\\': 3323 case '\n': 3324 case '\r': 3325 env.error = REG_EDELIM; 3326 goto bad; 3327 } 3328 env.terminator = '\n'; 3329 } 3330 env.literal = env.pattern = env.cursor = (unsigned char*)pattern; 3331 if (!(p->env->rex = alt(&env, 1, 0))) 3332 goto bad; 3333 if (env.parnest) 3334 { 3335 env.error = REG_EPAREN; 3336 goto bad; 3337 } 3338 if ((env.flags & REG_LEFT) && p->env->rex->type != REX_BEG) 3339 { 3340 if (p->env->rex->type == REX_ALT) 3341 env.flags &= ~REG_FIRST; 3342 if (!(e = node(&env, REX_BEG, 0, 0, 0))) 3343 { 3344 regfree(p); 3345 return fatal(disc, REG_ESPACE, pattern); 3346 } 3347 e->next = p->env->rex; 3348 p->env->rex = e; 3349 p->env->once = 1; 3350 } 3351 for (e = p->env->rex; e->next; e = e->next); 3352 p->env->done.type = REX_DONE; 3353 p->env->done.flags = e->flags; 3354 if ((env.flags & REG_RIGHT) && e->type != REX_END) 3355 { 3356 if (p->env->rex->type == REX_ALT) 3357 env.flags &= ~REG_FIRST; 3358 if (!(f = node(&env, REX_END, 0, 0, 0))) 3359 { 3360 regfree(p); 3361 return fatal(disc, REG_ESPACE, pattern); 3362 } 3363 f->flags = e->flags; 3364 f->map = e->map; 3365 e->next = f; 3366 } 3367 if (stats(&env, p->env->rex)) 3368 { 3369 if (!env.error) 3370 env.error = REG_ECOUNT; 3371 goto bad; 3372 } 3373 if (env.stats.b) 3374 p->env->hard = p->env->separate = 1; 3375 else if (!(env.flags & REG_FIRST) && (env.stats.a || env.stats.c > 1 && env.stats.c != env.stats.s || env.stats.t && (env.stats.t > 1 || env.stats.a || env.stats.c))) 3376 p->env->hard = 1; 3377 if (p->env->hard || env.stats.c || env.stats.i) 3378 p->env->stats.re_min = p->env->stats.re_max = -1; 3379 else 3380 { 3381 if (!(p->env->stats.re_min = env.stats.m)) 3382 p->env->stats.re_min = -1; 3383 if (!(p->env->stats.re_max = env.stats.n)) 3384 p->env->stats.re_max = -1; 3385 } 3386 if (special(&env, p)) 3387 goto bad; 3388 serialize(&env, p->env->rex, 1); 3389 p->re_nsub = env.stats.p; 3390 if (env.type == KRE) 3391 p->re_nsub /= 2; 3392 if (env.flags & REG_DELIMITED) 3393 { 3394 p->re_npat = env.cursor - env.pattern + 1; 3395 if (*env.cursor == env.delimiter) 3396 p->re_npat++; 3397 else if (env.flags & REG_MUSTDELIM) 3398 { 3399 env.error = REG_EDELIM; 3400 goto bad; 3401 } 3402 else 3403 env.flags &= ~REG_DELIMITED; 3404 } 3405 p->env->explicit = env.explicit; 3406 p->env->flags = env.flags & REG_COMP; 3407 p->env->min = env.stats.m; 3408 p->env->nsub = env.stats.p + env.stats.u; 3409 p->env->refs = 1; 3410 return 0; 3411 bad: 3412 regfree(p); 3413 if (!env.error) 3414 env.error = REG_ESPACE; 3415 if (env.type >= SRE && env.error != REG_ESPACE && !(flags & REG_LITERAL)) 3416 { 3417 flags |= REG_LITERAL; 3418 pattern = (const char*)env.literal; 3419 goto again; 3420 } 3421 return fatal(disc, env.error, pattern); 3422 } 3423 3424 /* 3425 * regcomp() on sized pattern 3426 * the lazy way around adding and checking env.end 3427 */ 3428 3429 int 3430 regncomp(regex_t* p, const char* pattern, size_t size, regflags_t flags) 3431 { 3432 char* s; 3433 int r; 3434 3435 if (!(s = malloc(size + 1))) 3436 return fatal((flags & REG_DISCIPLINE) ? p->re_disc : &state.disc, REG_ESPACE, pattern); 3437 memcpy(s, pattern, size); 3438 s[size] = 0; 3439 r = regcomp(p, s, flags); 3440 free(s); 3441 return r; 3442 } 3443 3444 /* 3445 * combine two compiled regular expressions if possible, 3446 * replacing first with the combination and freeing second. 3447 * return 0 on success. 3448 * the only combinations handled are building a Trie 3449 * from String|Kmp|Trie and String|Kmp 3450 */ 3451 3452 int 3453 regcomb(regex_t* p, regex_t* q) 3454 { 3455 Rex_t* e = p->env->rex; 3456 Rex_t* f = q->env->rex; 3457 Rex_t* g; 3458 Rex_t* h; 3459 Cenv_t env; 3460 3461 if (!e || !f) 3462 return fatal(p->env->disc, REG_BADPAT, NiL); 3463 if (p->env->separate || q->env->separate) 3464 return REG_ESUBREG; 3465 memset(&env, 0, sizeof(env)); 3466 env.disc = p->env->disc; 3467 if (e->type == REX_BM) 3468 { 3469 p->env->rex = e->next; 3470 e->next = 0; 3471 drop(env.disc, e); 3472 e = p->env->rex; 3473 } 3474 if (f->type == REX_BM) 3475 { 3476 q->env->rex = f->next; 3477 f->next = 0; 3478 drop(env.disc, f); 3479 f = q->env->rex; 3480 } 3481 if (e->type == REX_BEG && f->type == REX_BEG) 3482 { 3483 p->env->flags |= REG_LEFT; 3484 p->env->rex = e->next; 3485 e->next = 0; 3486 drop(env.disc, e); 3487 e = p->env->rex; 3488 q->env->rex = f->next; 3489 f->next = 0; 3490 drop(env.disc, f); 3491 f = q->env->rex; 3492 } 3493 for (g = e; g->next; g = g->next); 3494 for (h = f; h->next; h = h->next); 3495 if (g->next && g->next->type == REX_END && h->next && h->next->type == REX_END) 3496 { 3497 p->env->flags |= REG_RIGHT; 3498 drop(env.disc, g->next); 3499 g->next = 0; 3500 drop(env.disc, h->next); 3501 h->next = 0; 3502 } 3503 if (!(g = trie(&env, f, e))) 3504 return fatal(p->env->disc, REG_BADPAT, NiL); 3505 p->env->rex = g; 3506 if (!q->env->once) 3507 p->env->once = 0; 3508 q->env->rex = 0; 3509 if (p->env->flags & REG_LEFT) 3510 { 3511 if (!(e = node(&env, REX_BEG, 0, 0, 0))) 3512 { 3513 regfree(p); 3514 return fatal(p->env->disc, REG_ESPACE, NiL); 3515 } 3516 e->next = p->env->rex; 3517 p->env->rex = e; 3518 p->env->once = 1; 3519 } 3520 if (p->env->flags & REG_RIGHT) 3521 { 3522 for (f = p->env->rex; f->next; f = f->next); 3523 if (f->type != REX_END) 3524 { 3525 if (!(e = node(&env, REX_END, 0, 0, 0))) 3526 { 3527 regfree(p); 3528 return fatal(p->env->disc, REG_ESPACE, NiL); 3529 } 3530 f->next = e; 3531 } 3532 } 3533 env.explicit = p->env->explicit; 3534 env.flags = p->env->flags; 3535 env.disc = p->env->disc; 3536 if (stats(&env, p->env->rex)) 3537 { 3538 regfree(p); 3539 return fatal(p->env->disc, env.error ? env.error : REG_ECOUNT, NiL); 3540 } 3541 if (special(&env, p)) 3542 { 3543 regfree(p); 3544 return fatal(p->env->disc, env.error ? env.error : REG_ESPACE, NiL); 3545 } 3546 p->env->min = g->re.trie.min; 3547 return 0; 3548 } 3549 3550 /* 3551 * copy a reference of p into q 3552 * p and q may then have separate regsubcomp() instantiations 3553 */ 3554 3555 int 3556 regdup(regex_t* p, regex_t* q) 3557 { 3558 if (!p || !q) 3559 return REG_BADPAT; 3560 *q = *p; 3561 p->env->refs++; 3562 q->re_sub = 0; 3563 return 0; 3564 } 3565