1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 1985-2012 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Eclipse Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.eclipse.org/org/documents/epl-v10.html *
11 * (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
19 * Phong Vo <kpv@research.att.com> *
20 * *
21 ***********************************************************************/
22 #pragma prototyped
23
24 /*
25 * posix regex compiler
26 */
27
28 #include "reglib.h"
29
30 #if _PACKAGE_ast
31 #include "lclib.h"
32 #endif
33
34 #define serialize re_serialize /* hp.ia64 <unistd.h>! */
35
36 #define C_ESC (-1)
37 #define C_MB (-2)
38
39 #if _AST_REGEX_DEBUG
40
41 #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n))
42 #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0)
43 #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_comp_debug")) debug |= strtoul(t, NiL, 0); } } while (0)
44
45 static unsigned long debug;
46 static unsigned long debug_flag;
47
48 #else
49
50 #define DEBUG_INIT()
51 #define DEBUG_TEST(f,y,n) (n)
52 #define DEBUG_CODE(f,y,n) do {n} while(0)
53
54 #endif
55
56 #if _PACKAGE_ast
57
58 typedef struct Cchr_s
59 {
60 Dtlink_t lnk;
61 unsigned char nam[2];
62 Ckey_t key;
63 } Cchr_t;
64
65 #endif
66
67 #define eat(p) do{if ((p)->token.push)(p)->token.push=0;else (p)->cursor+=(p)->token.len;}while (0)
68
69 /*
70 * determine whether greedy matching will work, i.e. produce
71 * the best match first. such expressions are "easy", and
72 * need no backtracking once a complete match is found.
73 * if an expression has backreferences or alts it's hard
74 * else if it has only one closure it's easy
75 * else if all closures are simple (i.e. one-character) it's easy
76 * else it's hard.
77 */
78
79 typedef struct Stats_s
80 {
81 unsigned long l; /* min length to left of x */
82 unsigned long k; /* min length to left of y */
83 unsigned long m; /* min length */
84 unsigned long n; /* max length */
85 unsigned short a; /* number of alternations */
86 unsigned short b; /* number of backrefs */
87 unsigned short c; /* number of closures */
88 unsigned short e; /* $ */
89 unsigned short i; /* number of negations */
90 unsigned short p; /* number of named subexpressions */
91 unsigned short s; /* number of simple closures */
92 unsigned short t; /* number of tries */
93 unsigned short u; /* number of unnamed subexpressions */
94 Rex_t* x; /* max length REX_STRING */
95 Rex_t* y; /* max length REX_TRIE */
96 } Stats_t;
97
98 typedef struct Token_s
99 {
100 unsigned long min;
101 unsigned long max;
102 short lex;
103 short len;
104 short esc;
105 short att;
106 short push;
107 } Token_t;
108
109 typedef struct Cenv_s
110 {
111 int delimiter; /* pattern delimiter */
112 int error; /* last error */
113 int explicit; /* explicit match on this char */
114 int mappeddot; /* inverse mapped '.' */
115 int mappednewline; /* inverse mapped '\n' */
116 int mappedslash; /* inverse mapped '/' */
117 regflags_t flags; /* flags arg to regcomp */
118 int type; /* BRE,ERE,ARE,SRE,KRE */
119 unsigned char* cursor; /* curent point in re */
120 unsigned char* pattern; /* the original pattern */
121 unsigned char* literal; /* literal restart pattern */
122 int parno; /* number of last open paren */
123 int parnest; /* paren nest count */
124 int posixkludge; /* to make * nonspecial */
125 Token_t token; /* token lookahead */
126 Stats_t stats; /* RE statistics */
127 int terminator; /* pattern terminator */
128 Rex_t* paren[2*(BACK_REF_MAX+2)];
129 /* paren[i]!=0 if \i defined */
130 regex_t* regex; /* user handle */
131 regdisc_t* disc; /* user discipline */
132 unsigned char* map; /* external to native ccode map */
133 unsigned char* MAP; /* fold and/or map */
134 } Cenv_t;
135
136 /*
137 * allocate a new Rex_t node
138 */
139
140 #if _BLD_DEBUG
141 #define node(a,b,c,d,e) node(a,b,c,d,e, const char* file, unsigned int line)
142 #endif
143
144 static Rex_t*
node(Cenv_t * env,int type,int lo,int hi,size_t extra)145 node(Cenv_t* env, int type, int lo, int hi, size_t extra)
146 {
147 register Rex_t* e;
148
149 DEBUG_TEST(0x0800,(sfprintf(sfstdout, "%s:%d node(%d,%d,%d,%u)\n", file, line, type, lo, hi, sizeof(Rex_t) + extra)),(0));
150 if (e = (Rex_t*)alloc(env->disc, 0, sizeof(Rex_t) + extra))
151 {
152 memset(e, 0, sizeof(Rex_t) + extra);
153 e->type = type;
154 e->marked = 0;
155 e->lo = lo;
156 e->hi = hi;
157 e->flags = env->flags;
158 e->map = (e->flags & REG_ICASE) ? env->MAP : env->map;
159 e->explicit = env->explicit;
160 if (extra)
161 e->re.data = (char*)e + sizeof(Rex_t);
162 }
163 return e;
164 }
165
166 #if _BLD_DEBUG
167 #undef node
168 #define node(a,b,c,d,e) node(a,b,c,d,e,__FILE__,__LINE__)
169 #endif
170
171 /*
172 * free a Trie_node_t node
173 */
174
175 static void
triedrop(regdisc_t * disc,Trie_node_t * e)176 triedrop(regdisc_t* disc, Trie_node_t* e)
177 {
178 if (e)
179 {
180 triedrop(disc, e->son);
181 triedrop(disc, e->sib);
182 alloc(disc, e, 0);
183 }
184 }
185
186 /*
187 * free a Rex_t node
188 */
189
190 void
drop(regdisc_t * disc,Rex_t * e)191 drop(regdisc_t* disc, Rex_t* e)
192 {
193 int i;
194 Rex_t* x;
195
196 if (e && !(disc->re_flags & REG_NOFREE))
197 do
198 {
199 switch (e->type)
200 {
201 case REX_ALT:
202 case REX_CONJ:
203 drop(disc, e->re.group.expr.binary.left);
204 drop(disc, e->re.group.expr.binary.right);
205 break;
206 case REX_GROUP:
207 case REX_GROUP_AHEAD:
208 case REX_GROUP_AHEAD_NOT:
209 case REX_GROUP_BEHIND:
210 case REX_GROUP_BEHIND_NOT:
211 case REX_GROUP_CUT:
212 case REX_NEG:
213 case REX_REP:
214 drop(disc, e->re.group.expr.rex);
215 break;
216 case REX_TRIE:
217 for (i = 0; i <= UCHAR_MAX; i++)
218 triedrop(disc, e->re.trie.root[i]);
219 break;
220 }
221 x = e->next;
222 alloc(disc, e, 0);
223 } while (e = x);
224 }
225
226 /*
227 * mark e and descendants minimal
228 */
229
230 static void
mark(register Rex_t * e,int set)231 mark(register Rex_t* e, int set)
232 {
233 if (e && !e->marked)
234 do
235 {
236 e->marked = 1;
237 if (set)
238 e->flags |= REG_MINIMAL;
239 else
240 e->flags &= ~REG_MINIMAL;
241 switch (e->type)
242 {
243 case REX_ALT:
244 case REX_CONJ:
245 case REX_GROUP_COND:
246 if (e->re.group.expr.binary.left)
247 mark(e->re.group.expr.binary.left, set);
248 if (e->re.group.expr.binary.right)
249 mark(e->re.group.expr.binary.right, set);
250 break;
251 case REX_GROUP:
252 case REX_GROUP_AHEAD:
253 case REX_GROUP_AHEAD_NOT:
254 case REX_GROUP_BEHIND:
255 case REX_GROUP_BEHIND_NOT:
256 case REX_GROUP_CUT:
257 case REX_NEG:
258 case REX_REP:
259 case REX_TRIE:
260 mark(e->re.group.expr.rex, set);
261 break;
262 }
263 } while (e = e->next);
264 }
265
266 /*
267 * assign subexpression numbers by a preorder tree walk
268 */
269
270 static int
serialize(Cenv_t * env,Rex_t * e,int n)271 serialize(Cenv_t* env, Rex_t* e, int n)
272 {
273 do
274 {
275 e->serial = n++;
276 switch (e->type)
277 {
278 case REX_ALT:
279 case REX_GROUP_COND:
280 if (e->re.group.expr.binary.left)
281 n = serialize(env, e->re.group.expr.binary.left, n);
282 e->re.group.expr.binary.serial = n++;
283 if (e->re.group.expr.binary.right)
284 n = serialize(env, e->re.group.expr.binary.right, n);
285 break;
286 case REX_CONJ:
287 n = serialize(env, e->re.group.expr.binary.left, n);
288 n = serialize(env, e->re.group.expr.binary.right, n);
289 break;
290 case REX_GROUP:
291 case REX_GROUP_AHEAD:
292 case REX_GROUP_AHEAD_NOT:
293 case REX_GROUP_BEHIND:
294 case REX_GROUP_BEHIND_NOT:
295 case REX_GROUP_CUT:
296 case REX_NEG:
297 case REX_REP:
298 n = serialize(env, e->re.group.expr.rex, n);
299 break;
300 }
301 } while (e = e->next);
302 return n;
303 }
304
305 /*
306 * catenate e and f into a sequence, collapsing them if possible
307 */
308
309 static Rex_t*
cat(Cenv_t * env,Rex_t * e,Rex_t * f)310 cat(Cenv_t* env, Rex_t* e, Rex_t* f)
311 {
312 Rex_t* g;
313
314 if (!f)
315 {
316 drop(env->disc, e);
317 return 0;
318 }
319 if (e->type == REX_NULL)
320 {
321 drop(env->disc, e);
322 return f;
323 }
324 if (f->type == REX_NULL)
325 {
326 g = f->next;
327 f->next = 0;
328 drop(env->disc, f);
329 f = g;
330 }
331 else if (e->type == REX_DOT && f->type == REX_DOT)
332 {
333 unsigned int m = e->lo + f->lo;
334 unsigned int n = e->hi + f->hi;
335
336 if (m <= RE_DUP_MAX)
337 {
338 if (e->hi > RE_DUP_MAX || f->hi > RE_DUP_MAX)
339 {
340 n = RE_DUP_INF;
341 goto combine;
342 }
343 else if (n <= RE_DUP_MAX)
344 {
345 combine:
346 e->lo = m;
347 e->hi = n;
348 g = f->next;
349 f->next = 0;
350 drop(env->disc, f);
351 f = g;
352 }
353 }
354 }
355 e->next = f;
356 return e;
357 }
358
359 /*
360 * collect re statistics
361 */
362
363 static int
stats(register Cenv_t * env,register Rex_t * e)364 stats(register Cenv_t* env, register Rex_t* e)
365 {
366 register unsigned long n;
367 register unsigned long m;
368 unsigned long cm;
369 unsigned long nm;
370 unsigned long cn;
371 unsigned long nn;
372 unsigned long l;
373 unsigned long k;
374 unsigned long t;
375 Rex_t* q;
376 Rex_t* x;
377 Rex_t* y;
378 unsigned char c;
379 unsigned char b;
380
381 do
382 {
383 switch (e->type)
384 {
385 case REX_ALT:
386 x = env->stats.x;
387 l = env->stats.l;
388 y = env->stats.y;
389 k = env->stats.k;
390 t = env->stats.t;
391 if (++env->stats.a <= 0)
392 return 1;
393 cm = env->stats.m;
394 env->stats.m = 0;
395 cn = env->stats.n;
396 env->stats.n = 0;
397 if (stats(env, e->re.group.expr.binary.left))
398 return 1;
399 m = env->stats.m;
400 env->stats.m = 0;
401 n = env->stats.n;
402 env->stats.n = 0;
403 if (e->re.group.expr.binary.right && stats(env, e->re.group.expr.binary.right))
404 return 1;
405 if (env->stats.m > m)
406 env->stats.m = m;
407 else
408 m = env->stats.m;
409 if ((env->stats.m += cm) < m)
410 return 1;
411 if (env->stats.n < n)
412 env->stats.n = n;
413 else
414 n = env->stats.n;
415 if ((env->stats.n += cn) < n)
416 return 1;
417 env->stats.x = x;
418 env->stats.l = l;
419 env->stats.y = y;
420 env->stats.k = k;
421 env->stats.t = t;
422 break;
423 case REX_BACK:
424 if (++env->stats.b <= 0)
425 return 1;
426 break;
427 case REX_CLASS:
428 case REX_COLL_CLASS:
429 case REX_DOT:
430 case REX_ONECHAR:
431 n = env->stats.m;
432 if ((env->stats.m += e->lo) < n)
433 return 1;
434 if (e->hi != RE_DUP_INF)
435 {
436 n = env->stats.n;
437 if ((env->stats.n += e->hi) < n)
438 return 1;
439 }
440 if (e->lo != e->hi)
441 {
442 if (++env->stats.c <= 0)
443 return 1;
444 if (++env->stats.s <= 0)
445 return 1;
446 }
447 break;
448 case REX_CONJ:
449 cm = env->stats.m;
450 env->stats.m = 0;
451 cn = env->stats.n;
452 env->stats.n = 0;
453 if (stats(env, e->re.group.expr.binary.left))
454 return 1;
455 nm = env->stats.m;
456 env->stats.m = 0;
457 nn = env->stats.n;
458 env->stats.n = 0;
459 if (stats(env, e->re.group.expr.binary.right))
460 return 1;
461 if (env->stats.m < nm)
462 env->stats.m = nm;
463 else
464 nm = env->stats.m;
465 if ((env->stats.m += cm) < nm)
466 return 1;
467 if (env->stats.n < nn)
468 env->stats.n = nn;
469 else
470 nn = env->stats.n;
471 if ((env->stats.n += cn) < nn)
472 return 1;
473 break;
474 case REX_END:
475 env->stats.e = 1;
476 break;
477 case REX_GROUP:
478 if (e->re.group.number && ++env->stats.p <= 0 || !e->re.group.number && ++env->stats.u <= 0)
479 return 1;
480 if (stats(env, e->re.group.expr.rex))
481 return 1;
482 break;
483 case REX_GROUP_AHEAD:
484 case REX_GROUP_AHEAD_NOT:
485 case REX_GROUP_BEHIND:
486 case REX_GROUP_BEHIND_NOT:
487 m = env->stats.m;
488 n = env->stats.n;
489 x = env->stats.x;
490 y = env->stats.y;
491 if (stats(env, e->re.group.expr.rex))
492 return 1;
493 env->stats.m = m;
494 env->stats.n = n;
495 env->stats.x = x;
496 env->stats.y = y;
497 switch (e->type)
498 {
499 case REX_GROUP_AHEAD:
500 case REX_GROUP_BEHIND:
501 if (++env->stats.u <= 0)
502 return 1;
503 break;
504 }
505 break;
506 case REX_GROUP_COND:
507 if (++env->stats.u <= 0)
508 return 1;
509 m = env->stats.m;
510 n = env->stats.n;
511 x = env->stats.x;
512 y = env->stats.y;
513 if (e->re.group.size > 0 && ++env->stats.b <= 0)
514 return 1;
515 if (e->re.group.expr.binary.left && stats(env, e->re.group.expr.binary.left))
516 return 1;
517 if (q = e->re.group.expr.binary.right)
518 {
519 if (q->re.group.expr.binary.left && stats(env, q->re.group.expr.binary.left))
520 return 1;
521 if (q->re.group.expr.binary.right && stats(env, q->re.group.expr.binary.right))
522 return 1;
523 }
524 env->stats.m = m;
525 env->stats.n = n;
526 env->stats.x = x;
527 env->stats.y = y;
528 break;
529 case REX_GROUP_CUT:
530 if (++env->stats.u <= 0)
531 return 1;
532 m = env->stats.m;
533 n = env->stats.n;
534 x = env->stats.x;
535 y = env->stats.y;
536 if (stats(env, e->re.group.expr.rex))
537 return 1;
538 env->stats.m = m;
539 env->stats.n = n;
540 env->stats.x = x;
541 env->stats.y = y;
542 break;
543 case REX_NEG:
544 env->stats.i++;
545 x = env->stats.x;
546 l = env->stats.l;
547 y = env->stats.y;
548 k = env->stats.k;
549 t = env->stats.t;
550 cm = env->stats.m;
551 env->stats.m = 0;
552 if (stats(env, e->re.group.expr.rex))
553 return 1;
554 env->stats.m = !env->stats.m;
555 if ((env->stats.m += cm) < cm)
556 return 1;
557 env->stats.x = x;
558 env->stats.l = l;
559 env->stats.y = y;
560 env->stats.k = k;
561 env->stats.t = t;
562 break;
563 case REX_REP:
564 x = env->stats.x;
565 l = env->stats.l;
566 y = env->stats.y;
567 k = env->stats.k;
568 t = env->stats.t;
569 if (++env->stats.c <= 0)
570 return 1;
571 b = env->stats.b;
572 c = env->stats.c;
573 cm = env->stats.m;
574 env->stats.m = 0;
575 if (stats(env, e->re.group.expr.rex))
576 return 1;
577 if (env->stats.m == 1 && b == env->stats.b && c == env->stats.c && ++env->stats.s <= 0)
578 return 1;
579 if (e->lo < 1)
580 {
581 env->stats.x = x;
582 env->stats.l = l;
583 env->stats.y = y;
584 env->stats.k = k;
585 env->stats.t = t;
586 env->stats.m = cm;
587 }
588 else
589 {
590 m = env->stats.m;
591 if ((env->stats.m *= e->lo) > 0 && env->stats.m < m)
592 return 1;
593 m = env->stats.m;
594 if ((env->stats.m += cm) < m)
595 return 1;
596 if (env->stats.x != x)
597 env->stats.l = cm;
598 if (env->stats.y != y)
599 env->stats.k = cm;
600 }
601 break;
602 case REX_STRING:
603 if (!e->map)
604 {
605 cm = env->stats.m;
606 if ((env->stats.m += e->re.string.size) < cm)
607 return 1;
608 cn = env->stats.n;
609 if ((env->stats.n += e->re.string.size) < cn)
610 return 1;
611 if (!env->stats.x || env->stats.x->re.string.size < e->re.string.size)
612 {
613 env->stats.x = e;
614 env->stats.l = cm;
615 }
616 }
617 break;
618 case REX_TRIE:
619 if (++env->stats.s <= 0)
620 return 1;
621 cm = env->stats.m;
622 if ((env->stats.m += e->re.trie.min) < cm)
623 return 1;
624 cn = env->stats.n;
625 if ((env->stats.n += e->re.trie.max) < cn)
626 return 1;
627 env->stats.t++;
628 if (!env->stats.y || env->stats.y->re.trie.min < e->re.trie.min)
629 {
630 env->stats.y = e;
631 env->stats.k = cm;
632 }
633 break;
634 }
635 } while (e = e->next);
636 return 0;
637 }
638
639 static int token(Cenv_t*);
640
641 static int
magic(register Cenv_t * env,register int c,int escaped)642 magic(register Cenv_t* env, register int c, int escaped)
643 {
644 register char* sp;
645 register int n;
646 int o = c;
647 int e = env->error;
648 int l = env->token.len;
649 short* mp;
650 char* ep;
651
652 if (mp = state.magic[c])
653 {
654 c = mp[env->type+escaped];
655 if (c >= T_META)
656 {
657 sp = (char*)env->cursor + env->token.len;
658 switch (c)
659 {
660 case T_LEFT:
661 n = 0;
662 ep = sp;
663 while (*sp >= '0' && *sp <= '9')
664 {
665 if (n > (INT_MAX / 10))
666 {
667 env->error = REG_BADBR;
668 goto bad;
669 }
670 n = n * 10 + *sp++ - '0';
671 }
672 if (sp == ep)
673 {
674 if (env->type < SRE || *sp != ',')
675 {
676 env->error = *sp ? REG_BADBR : REG_EBRACE;
677 goto bad;
678 }
679 }
680 else if (n > RE_DUP_MAX)
681 {
682 env->error = REG_BADBR;
683 goto bad;
684 }
685 env->token.min = n;
686 if (*sp == ',')
687 {
688 n = 0;
689 ep = ++sp;
690 while (*sp >= '0' && *sp <= '9')
691 {
692 if (n > (INT_MAX / 10))
693 {
694 env->error = REG_BADBR;
695 goto bad;
696 }
697 n = n * 10 + *sp++ - '0';
698 }
699 if (sp == ep)
700 n = RE_DUP_INF;
701 else if (n < env->token.min)
702 {
703 env->error = REG_BADBR;
704 goto bad;
705 }
706 }
707 env->token.max = n;
708 switch (*sp)
709 {
710 case 0:
711 env->error = REG_EBRACE;
712 goto bad;
713 case '\\':
714 if (!escaped)
715 {
716 env->error = REG_BADBR;
717 goto bad;
718 }
719 sp++;
720 break;
721 default:
722 if (escaped)
723 {
724 env->error = REG_BADBR;
725 goto bad;
726 }
727 break;
728 }
729 switch (*sp++)
730 {
731 case 0:
732 env->error = REG_EBRACE;
733 goto bad;
734 case '}':
735 break;
736 default:
737 env->error = REG_BADBR;
738 goto bad;
739 }
740 env->token.len = sp - (char*)env->cursor;
741 break;
742 case T_RIGHT:
743 env->error = REG_EBRACE;
744 goto bad;
745 case T_OPEN:
746 if (env->type < SRE && *sp == '?')
747 {
748 env->token.len++;
749 env->token.lex = 0;
750 goto group;
751 }
752 break;
753 case T_ESCAPE:
754 c = chresc(sp - 2, &ep);
755 if (ep < sp)
756 goto bad;
757 env->token.len += ep - sp;
758 if (c >= T_META)
759 {
760 env->token.lex = c;
761 c = C_ESC;
762 }
763 return c;
764 case T_BACK+0:
765 case T_BACK+1:
766 case T_BACK+2:
767 case T_BACK+3:
768 case T_BACK+4:
769 case T_BACK+5:
770 case T_BACK+6:
771 case T_BACK+7:
772 n = chresc(sp - 2, &ep);
773 if (ep > sp + 1)
774 {
775 env->token.len += ep - sp;
776 return n;
777 }
778 /*FALLTHROUGH*/
779 case T_BACK+8:
780 case T_BACK+9:
781 if (env->type == SRE || c == T_BACK && !(env->flags & (REG_LENIENT|REG_REGEXP)))
782 {
783 env->error = REG_BADESC;
784 goto bad;
785 }
786 if ((env->flags & REG_MULTIREF) && isdigit(*sp))
787 {
788 c = (c - T_BACK) * 10 + (*sp - '0');
789 if (c > 0 && c <= env->parno && env->paren[c])
790 c += T_BACK;
791 else
792 c = chresc(sp - 2, &ep);
793 env->token.len++;
794 }
795 if (c == T_BACK)
796 c = 0;
797 break;
798 case T_BAD:
799 if (escaped == 1 && (env->flags & (REG_LENIENT|REG_REGEXP)) && (c = mp[env->type+escaped+2]) >= T_META)
800 return c;
801 goto bad;
802 }
803 if (env->type >= SRE)
804 {
805 if (c == T_DOT)
806 c = '.';
807 else if (c < T_OPEN)
808 {
809 if (env->type == KRE && *(env->cursor + env->token.len) == '-' && *(env->cursor + env->token.len + 1) == '(')
810 {
811 env->token.len++;
812 env->token.att = 1;
813 }
814 if (env->type == KRE && *(env->cursor + env->token.len) == '(')
815 {
816 env->token.len++;
817 switch (c)
818 {
819 case T_AT:
820 break;
821 case T_PERCENT:
822 env->token.lex = c;
823 goto group;
824 case T_TILDE:
825 env->token.lex = 0;
826 goto group;
827 default:
828 env->token.lex = c;
829 break;
830 }
831 c = T_OPEN;
832 }
833 else if (c == T_STAR)
834 c = T_DOTSTAR;
835 else if (c == T_QUES)
836 c = T_DOT;
837 else
838 {
839 c = o;
840 env->token.len = l;
841 }
842 }
843 else if (c > T_BACK)
844 {
845 c = (c - T_BACK) * 2 - 1;
846 c = (c > env->parno || !env->paren[c]) ? o : T_BACK + c;
847 }
848 else if (env->type == KRE && !env->parnest && (env->flags & REG_SHELL_GROUP))
849 {
850 if (c == T_AND)
851 c = '&';
852 else if (c == T_BAR)
853 c = '|';
854 else if (c == T_OPEN)
855 c = '(';
856 }
857 }
858 }
859 }
860 else if (escaped == 2)
861 {
862 if (env->type >= SRE && !(env->flags & REG_SHELL_ESCAPED) || (env->flags & REG_ESCAPE) && (c == '[' || c == '-' || c == ']' || env->delimiter && c == env->delimiter))
863 /*ok*/;
864 else
865 {
866 env->error = REG_BADESC;
867 goto bad;
868 }
869 }
870 else if (escaped && !(env->flags & (REG_LENIENT|REG_REGEXP)) && c != ']')
871 {
872 env->error = REG_BADESC;
873 goto bad;
874 }
875 return c;
876 group:
877 sp = (char*)env->cursor + env->token.len;
878 switch (*sp++)
879 {
880 case ')':
881 break;
882 case '#':
883 for (;;)
884 {
885 switch (*sp++)
886 {
887 case 0:
888 env->error = REG_EPAREN;
889 return T_BAD;
890 case ')':
891 break;
892 default:
893 continue;
894 }
895 break;
896 }
897 break;
898 default:
899 return T_GROUP;
900 }
901 env->cursor = (unsigned char*)sp;
902 return token(env);
903 bad:
904 if (escaped == 2)
905 env->error = e;
906 else if (env->flags & (REG_LENIENT|REG_REGEXP))
907 return o;
908 else if (escaped == 1 && !env->error)
909 {
910 if (mp || o == ']')
911 return o;
912 env->error = REG_BADESC;
913 }
914 return T_BAD;
915 }
916
917 static int
token(register Cenv_t * env)918 token(register Cenv_t* env)
919 {
920 int c;
921 int posixkludge;
922
923 if (env->token.push)
924 return env->token.lex;
925 env->token.att = env->token.esc = 0;
926 if ((env->token.len = MBSIZE(env->cursor)) > 1)
927 return env->token.lex = C_MB;
928 env->token.lex = 0;
929 for (;;)
930 {
931 c = *env->cursor;
932 if (c == 0 || c == env->delimiter || c == env->terminator)
933 return T_END;
934 if (!(env->flags & REG_COMMENT))
935 break;
936 if (c == '#')
937 {
938 do
939 {
940 c = *++env->cursor;
941 if (c == 0 || c == env->delimiter)
942 return T_END;
943 } while (c != '\n');
944 }
945 else if (!isspace(c))
946 break;
947 env->cursor++;
948 }
949 if (c == '\n' && (env->flags & REG_MULTIPLE) && !env->delimiter)
950 {
951 if (env->parnest)
952 {
953 env->error = REG_EPAREN;
954 return T_BAD;
955 }
956 env->parno = 0;
957 env->pattern = env->cursor + 1;
958 return T_BAR;
959 }
960 if (env->flags & REG_LITERAL)
961 return c;
962 if (posixkludge = env->posixkludge)
963 {
964 env->posixkludge = 0;
965 if (c == '*')
966 return c;
967 }
968 if (c == '\\')
969 {
970 if (env->flags & REG_SHELL_ESCAPED)
971 return c;
972 if (!(c = *(env->cursor + 1)) || c == env->terminator)
973 {
974 if (env->flags & (REG_LENIENT|REG_REGEXP))
975 {
976 if (c)
977 {
978 env->token.esc = env->token.len;
979 env->token.len += MBSIZE(env->cursor + 1);
980 return c;
981 }
982 return '\\';
983 }
984 env->error = REG_EESCAPE;
985 return T_BAD;
986 }
987 env->token.esc = env->token.len;
988 env->token.len += MBSIZE(env->cursor + 1);
989 if (env->delimiter && c == 'n')
990 return '\n';
991 else if (c == env->delimiter)
992 return magic(env, c, 0);
993 else if (c == '(' && env->type == BRE)
994 env->posixkludge = 1;
995 else if (c == ')' && env->type == BRE && env->parnest <= 0)
996 {
997 env->error = REG_EPAREN;
998 return T_BAD;
999 }
1000 else if (isspace(c) && (env->flags & REG_COMMENT))
1001 return c;
1002 return magic(env, c, 1);
1003 }
1004 else if (c == '$')
1005 {
1006 if (env->type == BRE && (*(env->cursor + 1) == 0 || *(env->cursor + 1) == env->delimiter || *(env->cursor + 1) == env->terminator || *(env->cursor + 1) == '\\' && *(env->cursor + 2) == ')') || (env->flags & REG_MULTIPLE) && *(env->cursor + 1) == '\n')
1007 return T_DOLL;
1008 }
1009 else if (c == '^')
1010 {
1011 if (env->type == BRE && (env->cursor == env->pattern || posixkludge == 1))
1012 {
1013 env->posixkludge = 2;
1014 return T_CFLX;
1015 }
1016 }
1017 else if (c == ')')
1018 {
1019 if (env->type != BRE && env->parnest <= 0)
1020 return c;
1021 }
1022 else if (c == '/' && env->explicit == env->mappedslash)
1023 {
1024 while (*(env->cursor + env->token.len) == c)
1025 env->token.len++;
1026 return T_SLASHPLUS;
1027 }
1028 return magic(env, c, 0);
1029 }
1030
1031 static Celt_t*
col(Celt_t * ce,int ic,unsigned char * bp,int bw,int bc,unsigned char * ep,int ew,int ec)1032 col(Celt_t* ce, int ic, unsigned char* bp, int bw, int bc, unsigned char* ep, int ew, int ec)
1033 {
1034 register char* s;
1035 register unsigned char* k;
1036 register unsigned char* e;
1037 register int c;
1038 register int cc;
1039 int bt;
1040 int et;
1041 Ckey_t key;
1042
1043 cc = 0;
1044 for (;;)
1045 {
1046 k = key;
1047 if (bw == 1)
1048 {
1049 c = bc;
1050 if (ic)
1051 {
1052 if (isupper(c))
1053 {
1054 c = tolower(c);
1055 cc = -1;
1056 }
1057 else if (islower(c))
1058 {
1059 c = toupper(c);
1060 cc = -1;
1061 }
1062 }
1063 *k++ = c;
1064 }
1065 else if (bw < COLL_KEY_MAX)
1066 {
1067 s = (char*)bp;
1068 if (ic)
1069 {
1070 c = mbchar(s);
1071 if (iswupper(c))
1072 {
1073 c = towlower(c);
1074 cc = 1;
1075 }
1076 else if (iswlower(c))
1077 {
1078 c = towupper(c);
1079 cc = 1;
1080 }
1081 }
1082 if (cc > 0)
1083 {
1084 cc = -1;
1085 k += mbconv((char*)k, c);
1086 }
1087 else
1088 for (e = k + bw; k < e; *k++ = *s++);
1089 }
1090 *k = 0;
1091 mbxfrm(ce->beg, key, COLL_KEY_MAX);
1092 if (ep)
1093 {
1094 k = key;
1095 c = mbchar(k);
1096 if (iswupper(c))
1097 bt = COLL_range_uc;
1098 else if (iswlower(c))
1099 bt = COLL_range_lc;
1100 else
1101 bt = COLL_range;
1102 k = key;
1103 if (ew == 1)
1104 {
1105 c = ec;
1106 if (ic)
1107 {
1108 if (isupper(c))
1109 {
1110 c = tolower(c);
1111 cc = -1;
1112 }
1113 else if (islower(c))
1114 {
1115 c = toupper(c);
1116 cc = -1;
1117 }
1118 }
1119 *k++ = c;
1120 }
1121 else if (ew < COLL_KEY_MAX)
1122 {
1123 s = (char*)ep;
1124 if (ic)
1125 {
1126 c = mbchar(s);
1127 if (iswupper(c))
1128 {
1129 c = towlower(c);
1130 cc = 1;
1131 }
1132 else if (iswlower(c))
1133 {
1134 c = towupper(c);
1135 cc = 1;
1136 }
1137 }
1138 if (cc > 0)
1139 {
1140 cc = -1;
1141 k += mbconv((char*)k, c);
1142 }
1143 else
1144 for (e = k + ew; k < e; *k++ = *s++);
1145 }
1146 *k = 0;
1147 mbxfrm(ce->end, key, COLL_KEY_MAX);
1148 k = key;
1149 c = mbchar(k);
1150 if (iswupper(c))
1151 et = COLL_range_uc;
1152 else if (iswlower(c))
1153 et = COLL_range_lc;
1154 else
1155 et = COLL_range;
1156 ce->typ = bt == et ? bt : COLL_range;
1157 }
1158 else
1159 ce->typ = COLL_char;
1160 ce++;
1161 if (!ic || !cc)
1162 break;
1163 ic = 0;
1164 }
1165 return ce;
1166 }
1167
1168 static Rex_t*
bra(Cenv_t * env)1169 bra(Cenv_t* env)
1170 {
1171 Rex_t* e;
1172 int c;
1173 int i;
1174 int w;
1175 int neg;
1176 int last;
1177 int inrange;
1178 int complicated;
1179 int collate;
1180 int elements;
1181 unsigned char* first;
1182 unsigned char* start;
1183 unsigned char* begin;
1184 unsigned char* s;
1185 regclass_t f;
1186 unsigned char buf[4 * (COLL_KEY_MAX + 1)];
1187 #if _PACKAGE_ast
1188 int ic;
1189 char mbc[COLL_KEY_MAX + 1];
1190 #endif
1191
1192 if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t))))
1193 return 0;
1194 collate = complicated = elements = 0;
1195 if (*env->cursor == '^' || env->type >= SRE && *env->cursor == '!')
1196 {
1197 env->cursor++;
1198 complicated = neg = 1;
1199 }
1200 else
1201 neg = 0;
1202 first = env->cursor;
1203 start = first + MBSIZE(first);
1204 if (*env->cursor == 0 || *(env->cursor + 1) == 0 || *env->cursor == env->terminator || *(env->cursor + 1) == env->terminator || (env->flags & REG_ESCAPE) && (*env->cursor == env->delimiter || *env->cursor != '\\' && *(env->cursor + 1) == env->delimiter))
1205 goto error;
1206 begin = env->cursor + MBSIZE(env->cursor);
1207
1208 /*
1209 * inrange: 0=no, 1=possibly, 2=definitely
1210 */
1211
1212 inrange = 0;
1213 for (;;)
1214 {
1215 if (!(c = *env->cursor) || c == env->terminator || c == env->delimiter && (env->flags & REG_ESCAPE))
1216 goto error;
1217 env->cursor += (w = MBSIZE(env->cursor));
1218 if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)))
1219 {
1220 if (*env->cursor)
1221 {
1222 if (*env->cursor == 'n')
1223 {
1224 env->cursor++;
1225 c = '\n';
1226 }
1227 else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED))
1228 {
1229 env->token.len = 1;
1230 w = magic(env, *env->cursor, 2);
1231 if (env->token.len > 1 || w != T_BAD)
1232 {
1233 if (env->token.len == 1 && (f = classfun(w)))
1234 {
1235 if (inrange > 1)
1236 {
1237 if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP)))
1238 goto erange;
1239 inrange = 0;
1240 }
1241 env->cursor++;
1242 for (c = 0; c <= UCHAR_MAX; c++)
1243 if ((*f)(c))
1244 setadd(e->re.charclass, c);
1245 complicated++;
1246 elements++;
1247 continue;
1248 }
1249 if (env->token.len > 1 || w >= 0 && w < T_META)
1250 {
1251 c = w;
1252 if (c > UCHAR_MAX)
1253 {
1254 if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP)) && !mbwide())
1255 goto erange;
1256 c = UCHAR_MAX;
1257 }
1258 env->cursor += env->token.len;
1259 }
1260 }
1261 }
1262 }
1263 }
1264 else if (c == ']')
1265 {
1266 if (env->cursor == begin)
1267 {
1268 last = c;
1269 inrange = 1;
1270 continue;
1271 }
1272 if (inrange != 0)
1273 {
1274 setadd(e->re.charclass, last);
1275 elements++;
1276 if (inrange == 2)
1277 {
1278 setadd(e->re.charclass, '-');
1279 elements++;
1280 }
1281 }
1282 break;
1283 }
1284 else if (c == '-')
1285 {
1286 if (!inrange && env->cursor != begin && *env->cursor != ']')
1287 {
1288 if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP)))
1289 goto erange;
1290 continue;
1291 }
1292 else if (inrange == 1)
1293 {
1294 inrange = 2;
1295 complicated++;
1296 continue;
1297 }
1298 }
1299 else if (c == '[')
1300 {
1301 switch (*env->cursor)
1302 {
1303 case 0:
1304 goto error;
1305 case ':':
1306 if (env->flags & REG_REGEXP)
1307 goto normal;
1308 if (inrange == 1)
1309 {
1310 setadd(e->re.charclass, last);
1311 elements++;
1312 }
1313 if (!(f = regclass((char*)env->cursor, (char**)&env->cursor)))
1314 {
1315 if (env->cursor == start && (c = *(env->cursor + 1)))
1316 {
1317 s = start = env->cursor + 1;
1318 while (*++s && *s != ':');
1319 if (*s == ':' && *(s + 1) == ']' && *(s + 2) == ']')
1320 {
1321 if ((i = (s - start)) == 1)
1322 {
1323 switch (c)
1324 {
1325 case '<':
1326 i = REX_WBEG;
1327 break;
1328 case '>':
1329 i = REX_WEND;
1330 break;
1331 default:
1332 i = 0;
1333 break;
1334 }
1335 if (i)
1336 {
1337 env->cursor = s + 3;
1338 drop(env->disc, e);
1339 return node(env, i, 0, 0, 0);
1340 }
1341 }
1342 }
1343 }
1344 env->error = REG_ECTYPE;
1345 goto error;
1346 }
1347 for (c = 0; c <= UCHAR_MAX; c++)
1348 if ((*f)(c))
1349 setadd(e->re.charclass, c);
1350 inrange = 0;
1351 complicated++;
1352 elements++;
1353 continue;
1354 case '=':
1355 if (env->flags & REG_REGEXP)
1356 goto normal;
1357 if (inrange == 2)
1358 goto erange;
1359 if (inrange == 1)
1360 {
1361 setadd(e->re.charclass, last);
1362 elements++;
1363 }
1364 if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf), NiL)) < 0)
1365 goto ecollate;
1366 if (c > 1)
1367 collate++;
1368 else
1369 setadd(e->re.charclass, buf[0]);
1370 c = buf[0];
1371 inrange = 0;
1372 complicated++;
1373 elements++;
1374 continue;
1375 case '.':
1376 if (env->flags & REG_REGEXP)
1377 goto normal;
1378 if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf), NiL)) < 0)
1379 goto ecollate;
1380 if (c > 1)
1381 collate++;
1382 c = buf[0];
1383 complicated++;
1384 break;
1385 default:
1386 normal:
1387 if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))
1388 goto error;
1389 break;
1390 }
1391 }
1392 else if (w > 1)
1393 complicated++;
1394 if (inrange == 2)
1395 {
1396 if (last <= c)
1397 {
1398 for (i = last; i <= c; i++)
1399 setadd(e->re.charclass, i);
1400 inrange = env->type >= SRE || (env->flags & (REG_LENIENT|REG_REGEXP));
1401 elements += 2;
1402 }
1403 else if (env->type >= SRE)
1404 {
1405 setadd(e->re.charclass, last);
1406 setadd(e->re.charclass, c);
1407 elements += 2;
1408 inrange = 1;
1409 }
1410 else if (!(env->flags & (REG_LENIENT|REG_REGEXP)))
1411 goto erange;
1412 else
1413 inrange = 0;
1414 }
1415 else if (inrange == 1)
1416 {
1417 setadd(e->re.charclass, last);
1418 elements++;
1419 }
1420 else
1421 inrange = 1;
1422 last = c;
1423 }
1424 #if _PACKAGE_ast
1425 if (complicated && mbcoll())
1426 {
1427 Dt_t* dt;
1428 Cchr_t* cc;
1429 Cchr_t* tc;
1430 Cchr_t* xc;
1431 Celt_t* ce;
1432 Cchr_t key;
1433 int rw;
1434 int rc;
1435 wchar_t wc;
1436 unsigned char* rp;
1437 unsigned char* pp;
1438 char cb[2][COLL_KEY_MAX+1];
1439
1440 static Dtdisc_t disc;
1441
1442 static const char primary[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
1443
1444 if (!(dt = (Dt_t*)LCINFO(AST_LC_COLLATE)->data))
1445 {
1446 disc.key = offsetof(Cchr_t, key);
1447 if ((cc = newof(0, Cchr_t, elementsof(primary), 0)) && (dt = dtopen(&disc, Dtoset)))
1448 {
1449 for (i = 0; i < elementsof(primary) - 1; i++, cc++)
1450 {
1451 cc->nam[0] = primary[i];
1452 mbxfrm(cc->key, cc->nam, COLL_KEY_MAX);
1453 dtinsert(dt, cc);
1454 }
1455 for (i = 0; i < elementsof(cc->key); i++)
1456 cc->key[i] = ~0;
1457 dtinsert(dt, cc);
1458 LCINFO(AST_LC_COLLATE)->data = (void*)dt;
1459 }
1460 else
1461 {
1462 if (cc)
1463 free(cc);
1464 drop(env->disc, e);
1465 return 0;
1466 }
1467 }
1468 if (dt)
1469 {
1470 drop(env->disc, e);
1471 if (ic = env->flags & REG_ICASE)
1472 elements *= 2;
1473 if (!(e = node(env, REX_COLL_CLASS, 1, 1, (elements + 3) * sizeof(Celt_t))))
1474 return 0;
1475 ce = (Celt_t*)e->re.data;
1476 e->re.collate.invert = neg;
1477 e->re.collate.elements = ce;
1478 env->cursor = first;
1479 inrange = 0;
1480 for (;;)
1481 {
1482 if ((c = *env->cursor) == 0 || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter)
1483 goto error;
1484 pp = env->cursor;
1485 env->cursor += (w = MBSIZE(env->cursor));
1486 if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)))
1487 {
1488 if (*env->cursor)
1489 {
1490 if (*env->cursor == 'n')
1491 {
1492 pp = env->cursor++;
1493 c = '\n';
1494 }
1495 else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED))
1496 {
1497 env->token.len = 1;
1498 w = magic(env, *env->cursor, 2);
1499 if (env->token.len > 1 || w != T_BAD)
1500 {
1501 if (env->token.len == 1 && (f = classfun(w)))
1502 {
1503 if (inrange > 1)
1504 {
1505 if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP)))
1506 goto erange;
1507 inrange = 0;
1508 }
1509 env->cursor++;
1510 ce->fun = f;
1511 ce->typ = COLL_call;
1512 ce++;
1513 continue;
1514 }
1515 if (env->token.len > 1 || w >= 0 && w < T_META)
1516 {
1517 c = w;
1518 w = mbconv(mbc, c);
1519 pp = (unsigned char*)mbc;
1520 env->cursor += env->token.len;
1521 }
1522 }
1523 }
1524 }
1525 }
1526 else if (c == ']')
1527 {
1528 if (env->cursor == begin)
1529 {
1530 rp = pp;
1531 rw = w;
1532 inrange = 1;
1533 continue;
1534 }
1535 if (inrange != 0)
1536 {
1537 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1538 if (inrange == 2)
1539 ce = col(ce, ic, NiL, 1, '-', NiL, 0, 0);
1540 }
1541 break;
1542 }
1543 else if (c == '-')
1544 {
1545 if (!inrange && env->cursor != begin && *env->cursor != ']')
1546 {
1547 if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP)))
1548 goto erange;
1549 continue;
1550 }
1551 else if (inrange == 1)
1552 {
1553 inrange = 2;
1554 continue;
1555 }
1556 }
1557 else if (c == '[')
1558 {
1559 switch (*env->cursor)
1560 {
1561 case 0:
1562 goto error;
1563 case ':':
1564 if (env->flags & REG_REGEXP)
1565 goto complicated_normal;
1566 if (inrange == 1)
1567 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1568 if (!(f = regclass((char*)env->cursor, (char**)&env->cursor)))
1569 {
1570 if (env->cursor == start && (c = *(env->cursor + 1)) && *(env->cursor + 2) == ':' && *(env->cursor + 3) == ']' && *(env->cursor + 4) == ']')
1571 {
1572 switch (c)
1573 {
1574 case '<':
1575 i = REX_WBEG;
1576 break;
1577 case '>':
1578 i = REX_WEND;
1579 break;
1580 default:
1581 i = 0;
1582 break;
1583 }
1584 if (i)
1585 {
1586 env->cursor += 5;
1587 drop(env->disc, e);
1588 return node(env, i, 0, 0, 0);
1589 }
1590 }
1591 env->error = REG_ECTYPE;
1592 goto error;
1593 }
1594 ce->fun = f;
1595 ce->typ = COLL_call;
1596 ce++;
1597 inrange = 0;
1598 continue;
1599 case '=':
1600 if (env->flags & REG_REGEXP)
1601 goto complicated_normal;
1602 if (inrange == 2)
1603 goto erange;
1604 if (inrange == 1)
1605 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1606 pp = (unsigned char*)cb[inrange];
1607 rp = env->cursor + 1;
1608 if ((rw = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX, &wc)) < 0)
1609 goto ecollate;
1610 c = 0;
1611 if (ic)
1612 {
1613 if (iswupper(wc))
1614 {
1615 wc = towlower(wc);
1616 rw = mbconv((char*)pp, wc);
1617 c = 'u';
1618 }
1619 else if (iswlower(wc))
1620 c = 'l';
1621 }
1622 i = 1;
1623 for (;;)
1624 {
1625 mbxfrm(key.key, (char*)pp, COLL_KEY_MAX);
1626 if (!(cc = (Cchr_t*)dtsearch(dt, &key)) && !(cc = (Cchr_t*)dtprev(dt, &key)))
1627 {
1628 if (i)
1629 {
1630 c = *pp;
1631 goto singleton;
1632 }
1633 goto ecollate;
1634 }
1635 xc = (tc = (Cchr_t*)dtprev(dt, cc)) && !strcasecmp((char*)tc->nam, (char*)cc->nam) ? tc : cc;
1636 if (c == 'l' || c == 'L' && !(c = 0))
1637 ce->typ = COLL_range_lc;
1638 else if (c == 'u' || c == 'U' && !(c = 0))
1639 ce->typ = COLL_range_uc;
1640 else
1641 ce->typ = COLL_range;
1642 strcpy((char*)ce->beg, (char*)xc->key);
1643 if (!(cc = (Cchr_t*)dtnext(dt, cc)))
1644 {
1645 if (i)
1646 {
1647 c = *pp;
1648 goto singleton;
1649 }
1650 goto ecollate;
1651 }
1652 if (!strcasecmp((char*)xc->nam, (char*)cc->nam) && (tc = (Cchr_t*)dtnext(dt, cc)))
1653 cc = tc;
1654 strcpy((char*)ce->end, (char*)cc->key);
1655 ce->max = -1;
1656 ce++;
1657 if (!c)
1658 break;
1659 if (c == 'u')
1660 {
1661 wc = towlower(wc);
1662 c = 'L';
1663 }
1664 else
1665 {
1666 wc = towupper(wc);
1667 c = 'U';
1668 }
1669 rw = mbconv((char*)pp, wc);
1670 i = 0;
1671 }
1672 inrange = 0;
1673 c = *pp;
1674 continue;
1675 case '.':
1676 if (env->flags & REG_REGEXP)
1677 goto complicated_normal;
1678 pp = (unsigned char*)cb[inrange];
1679 if ((w = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX, NiL)) < 0)
1680 goto ecollate;
1681 c = *pp;
1682 break;
1683 default:
1684 complicated_normal:
1685 if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))
1686 goto error;
1687 break;
1688 }
1689 }
1690 singleton:
1691 if (inrange == 2)
1692 {
1693 ce = col(ce, ic, rp, rw, rc, pp, w, c);
1694 if (strcmp((char*)ce->beg, (char*)ce->end) > 0)
1695 {
1696 if (env->type < SRE && !(env->flags & (REG_LENIENT|REG_REGEXP)))
1697 goto erange;
1698 (ce-1)->typ = COLL_char;
1699 strcpy((char*)ce->beg, (char*)(ce-1)->end);
1700 ce->typ = COLL_char;
1701 ce++;
1702 }
1703 inrange = env->type >= SRE || (env->flags & (REG_LENIENT|REG_REGEXP));
1704 }
1705 else if (inrange == 1)
1706 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1707 else
1708 inrange = 1;
1709 rp = pp;
1710 rw = w;
1711 rc = c;
1712 }
1713 ce->typ = COLL_end;
1714 return e;
1715 }
1716 }
1717 #endif
1718 if (collate)
1719 goto ecollate;
1720 if (env->flags & REG_ICASE)
1721 for (i = 0; i <= UCHAR_MAX; i++)
1722 if (settst(e->re.charclass, i))
1723 {
1724 if (isupper(i))
1725 c = tolower(i);
1726 else if (islower(i))
1727 c = toupper(i);
1728 else
1729 continue;
1730 setadd(e->re.charclass, c);
1731 }
1732 if (neg)
1733 {
1734 for (i = 0; i < elementsof(e->re.charclass->bits); i++)
1735 e->re.charclass->bits[i] ^= ~0;
1736 if (env->explicit >= 0)
1737 setclr(e->re.charclass, env->explicit);
1738 }
1739 return e;
1740 ecollate:
1741 env->error = REG_ECOLLATE;
1742 goto error;
1743 erange:
1744 env->error = REG_ERANGE;
1745 error:
1746 drop(env->disc, e);
1747 if (!env->error)
1748 env->error = REG_EBRACK;
1749 return 0;
1750 }
1751
1752 static Rex_t*
ccl(Cenv_t * env,int type)1753 ccl(Cenv_t* env, int type)
1754 {
1755 int i;
1756 Rex_t* e;
1757 Celt_t* ce;
1758 regclass_t f;
1759
1760 if (!(f = classfun(type)))
1761 {
1762 env->error = REG_BADESC;
1763 return 0;
1764 }
1765 if (!mbcoll())
1766 {
1767 if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t))))
1768 return 0;
1769 for (i = 0; i <= UCHAR_MAX; i++)
1770 if ((*f)(i))
1771 setadd(e->re.charclass, i);
1772 if (env->explicit >= 0)
1773 setclr(e->re.charclass, env->explicit);
1774 }
1775 else
1776 {
1777 if (!(e = node(env, REX_COLL_CLASS, 1, 1, 2 * sizeof(Celt_t))))
1778 return 0;
1779 ce = (Celt_t*)e->re.data;
1780 e->re.collate.invert = 0;
1781 e->re.collate.elements = ce;
1782 ce->fun = f;
1783 ce->typ = COLL_call;
1784 ce++;
1785 ce->typ = COLL_end;
1786 }
1787 return e;
1788 }
1789
1790 static Rex_t*
rep(Cenv_t * env,Rex_t * e,int number,int last)1791 rep(Cenv_t* env, Rex_t* e, int number, int last)
1792 {
1793 Rex_t* f;
1794 unsigned long m = 0;
1795 unsigned long n = RE_DUP_INF;
1796 int minimal = -1;
1797
1798 if (!e)
1799 return 0;
1800 switch (token(env))
1801 {
1802 case T_BANG:
1803 eat(env);
1804 if (!(f = node(env, REX_NEG, m, n, 0)))
1805 {
1806 drop(env->disc, e);
1807 return 0;
1808 }
1809 f->re.group.expr.rex = e;
1810 return f;
1811 case T_QUES:
1812 eat(env);
1813 n = 1;
1814 break;
1815 case T_STAR:
1816 eat(env);
1817 break;
1818 case T_PLUS:
1819 eat(env);
1820 m = 1;
1821 break;
1822 case T_LEFT:
1823 eat(env);
1824 m = env->token.min;
1825 n = env->token.max;
1826 break;
1827 default:
1828 return e;
1829 }
1830 if (env->token.att)
1831 minimal = 1;
1832 else if (env->type < SRE)
1833 switch (token(env))
1834 {
1835 case T_QUES:
1836 eat(env);
1837 minimal = !(env->flags & REG_MINIMAL);
1838 break;
1839 case T_STAR: /*AST*/
1840 eat(env);
1841 minimal = !!(env->flags & REG_MINIMAL);
1842 break;
1843 }
1844 switch (e->type)
1845 {
1846 case REX_DOT:
1847 case REX_CLASS:
1848 case REX_COLL_CLASS:
1849 case REX_ONECHAR:
1850 e->lo = m;
1851 e->hi = n;
1852 if (minimal >= 0)
1853 mark(e, minimal);
1854 return e;
1855 #if HUH_2002_08_07
1856 case REX_BEG:
1857 #endif
1858 case REX_BEG_STR:
1859 case REX_END_STR:
1860 case REX_FIN_STR:
1861 case REX_WBEG:
1862 case REX_WEND:
1863 case REX_WORD:
1864 case REX_WORD_NOT:
1865 env->error = REG_BADRPT;
1866 drop(env->disc, e);
1867 return 0;
1868 }
1869 if (m == 1 && n == 1)
1870 {
1871 if (minimal >= 0)
1872 mark(e, minimal);
1873 return e;
1874 }
1875 if (!(f = node(env, REX_REP, m, n, 0)))
1876 {
1877 drop(env->disc, e);
1878 return 0;
1879 }
1880 f->re.group.expr.rex = e;
1881 f->re.group.number = number;
1882 f->re.group.last = last;
1883 if (minimal >= 0)
1884 mark(f, minimal);
1885 if (m <= n && n)
1886 {
1887 for (; e && e->type >= REX_GROUP && e->type <= REX_GROUP_CUT; e = e->re.group.expr.rex);
1888 if (e && e->type == REX_NEG)
1889 f->type = REX_GROUP;
1890 }
1891 return f;
1892 }
1893
1894 static int
isstring(Rex_t * e)1895 isstring(Rex_t* e)
1896 {
1897 switch (e->type)
1898 {
1899 case REX_ONECHAR:
1900 return e->lo == 1 && e->hi == 1;
1901 case REX_STRING:
1902 return 1;
1903 }
1904 return 0;
1905 }
1906
1907 static Trie_node_t*
trienode(Cenv_t * env,int c)1908 trienode(Cenv_t* env, int c)
1909 {
1910 Trie_node_t* t;
1911
1912 if (t = (Trie_node_t*)alloc(env->disc, 0, sizeof(Trie_node_t)))
1913 {
1914 memset(t, 0, sizeof(Trie_node_t));
1915 t->c = c;
1916 }
1917 return t;
1918 }
1919
1920 static int
insert(Cenv_t * env,Rex_t * f,Rex_t * g)1921 insert(Cenv_t* env, Rex_t* f, Rex_t* g)
1922 {
1923 unsigned char* s;
1924 unsigned char* e;
1925 Trie_node_t* t;
1926 int len;
1927 unsigned char tmp[2];
1928
1929 switch (f->type)
1930 {
1931 case REX_ONECHAR:
1932 *(s = tmp) = f->re.onechar;
1933 e = s + 1;
1934 break;
1935 case REX_STRING:
1936 s = f->re.string.base;
1937 e = s + f->re.string.size;
1938 break;
1939 default:
1940 return 1;
1941 }
1942 if (!(t = g->re.trie.root[*s]) && !(t = g->re.trie.root[*s] = trienode(env, *s)))
1943 return 1;
1944 for (len = 1;;)
1945 {
1946 if (t->c == *s)
1947 {
1948 if (++s >= e)
1949 break;
1950 if (!t->son && !(t->son = trienode(env, *s)))
1951 return 1;
1952 t = t->son;
1953 len++;
1954 }
1955 else
1956 {
1957 if (!t->sib && !(t->sib = trienode(env, *s)))
1958 return 1;
1959 t = t->sib;
1960 }
1961 }
1962 if (g->re.trie.min > len)
1963 g->re.trie.min = len;
1964 if (g->re.trie.max < len)
1965 g->re.trie.max = len;
1966 t->end = 1;
1967 return 0;
1968 }
1969
1970 /*
1971 * trie() tries to combine nontrivial e and f into a REX_TRIE
1972 * unless 0 is returned, e and f are deleted as far as possible
1973 * also called by regcomb
1974 */
1975
1976 static Rex_t*
trie(Cenv_t * env,Rex_t * e,Rex_t * f)1977 trie(Cenv_t* env, Rex_t* e, Rex_t* f)
1978 {
1979 Rex_t* g;
1980
1981 if (e->next || f->next || !isstring(e) || e->flags != f->flags)
1982 return 0;
1983 if (isstring(f))
1984 {
1985 if (!(g = node(env, REX_TRIE, 0, 0, (UCHAR_MAX + 1) * sizeof(Trie_node_t*))))
1986 return 0;
1987 g->re.trie.min = INT_MAX;
1988 if (insert(env, f, g))
1989 goto nospace;
1990 drop(env->disc, f);
1991 }
1992 else if (f->type != REX_TRIE)
1993 return 0;
1994 else
1995 g = f;
1996 if (insert(env, e, g))
1997 goto nospace;
1998 drop(env->disc, e);
1999 return g;
2000 nospace:
2001 if (g != f)
2002 drop(env->disc, g);
2003 return 0;
2004 }
2005
2006 static Rex_t* alt(Cenv_t*, int, int);
2007
2008 static int
chr(register Cenv_t * env,int * escaped)2009 chr(register Cenv_t* env, int* escaped)
2010 {
2011 unsigned char* p;
2012 int c;
2013
2014 *escaped = 0;
2015 if (!(c = *env->cursor))
2016 return -1;
2017 env->cursor++;
2018 if (c == '\\')
2019 {
2020 if (env->flags & REG_SHELL_ESCAPED)
2021 return c;
2022 if (!(c = *(env->cursor + 1)) || c == env->terminator)
2023 {
2024 if (env->flags & (REG_LENIENT|REG_REGEXP))
2025 return c ? c : '\\';
2026 env->error = REG_EESCAPE;
2027 return -1;
2028 }
2029 p = env->cursor;
2030 c = chresc((char*)env->cursor - 1, (char**)&env->cursor);
2031 *escaped = env->cursor - p;
2032 }
2033 return c;
2034 }
2035
2036 /*
2037 * open the perly gates
2038 */
2039
2040 static Rex_t*
grp(Cenv_t * env,int parno)2041 grp(Cenv_t* env, int parno)
2042 {
2043 Rex_t* e;
2044 Rex_t* f;
2045 int c;
2046 int g;
2047 int i;
2048 int n;
2049 int x;
2050 int esc;
2051 int typ;
2052 int beg;
2053 unsigned char* p;
2054
2055 g = env->flags;
2056 beg = env->pattern == env->cursor - env->token.len;
2057 if (!(c = env->token.lex) && (c = *env->cursor))
2058 env->cursor++;
2059 env->token.len = 0;
2060 env->parnest++;
2061 typ = -1;
2062 switch (c)
2063 {
2064 case '-':
2065 case '+':
2066 case 'a':
2067 case 'g':
2068 case 'i':
2069 case 'l':
2070 case 'm':
2071 case 'p':
2072 case 'r':
2073 case 's':
2074 case 'x':
2075 case 'A':
2076 case 'B':
2077 case 'E':
2078 case 'F':
2079 case 'G':
2080 case 'I':
2081 case 'K':
2082 case 'L':
2083 case 'M': /* glob(3) */
2084 case 'N': /* glob(3) */
2085 case 'O': /* glob(3) */
2086 case 'P':
2087 case 'R': /* pcre */
2088 case 'S':
2089 case 'U': /* pcre */
2090 case 'V':
2091 case 'X': /* pcre */
2092 x = REX_GROUP;
2093 i = 1;
2094 env->token.push = 1;
2095 for (;;)
2096 {
2097 switch (c)
2098 {
2099 case ')':
2100 if (!(env->flags & REG_LITERAL))
2101 {
2102 env->error = REG_BADRPT;
2103 return 0;
2104 }
2105 /*FALLTHROUGH*/
2106 case 0:
2107 case T_CLOSE:
2108 x = 0;
2109 goto done;
2110 case ':':
2111 eat(env);
2112 if (token(env) == T_CLOSE)
2113 x = 0;
2114 goto done;
2115 case '-':
2116 i = 0;
2117 break;
2118 case '+':
2119 i = 1;
2120 break;
2121 case 'a':
2122 if (i)
2123 env->flags |= (REG_LEFT|REG_RIGHT);
2124 else
2125 env->flags &= ~(REG_LEFT|REG_RIGHT);
2126 break;
2127 case 'g':
2128 if (i)
2129 env->flags &= ~REG_MINIMAL;
2130 else
2131 env->flags |= REG_MINIMAL;
2132 break;
2133 case 'i':
2134 if (i)
2135 env->flags |= REG_ICASE;
2136 else
2137 env->flags &= ~REG_ICASE;
2138 break;
2139 case 'l':
2140 if (i)
2141 env->flags |= REG_LEFT;
2142 else
2143 env->flags &= ~REG_LEFT;
2144 break;
2145 case 'm':
2146 if (i)
2147 env->flags |= REG_NEWLINE;
2148 else
2149 env->flags &= ~REG_NEWLINE;
2150 env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1;
2151 break;
2152 case 'p':
2153 if (i)
2154 env->flags &= ~REG_LENIENT;
2155 else
2156 env->flags |= REG_LENIENT;
2157 break;
2158 case 'r':
2159 if (i)
2160 env->flags |= REG_RIGHT;
2161 else
2162 env->flags &= ~REG_RIGHT;
2163 break;
2164 case 's':
2165 if (i)
2166 env->flags |= REG_SPAN;
2167 else
2168 env->flags &= ~REG_SPAN;
2169 env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1;
2170 break;
2171 case 'x':
2172 if (i)
2173 env->flags |= REG_COMMENT;
2174 else
2175 env->flags &= ~REG_COMMENT;
2176 break;
2177 case 'X':
2178 if (typ >= 0 || env->type == ERE && (env->flags & REG_CLASS_ESCAPE))
2179 break; /* PCRE_EXTRA */
2180 /*FALLTHROUGH*/
2181 case 'A':
2182 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2183 env->flags |= REG_AUGMENTED|REG_EXTENDED;
2184 typ = ARE;
2185 break;
2186 case 'B':
2187 case 'G':
2188 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2189 typ = BRE;
2190 break;
2191 case 'E':
2192 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2193 env->flags |= REG_EXTENDED;
2194 typ = ERE;
2195 break;
2196 case 'F':
2197 case 'L':
2198 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2199 env->flags |= REG_LITERAL;
2200 typ = ERE;
2201 break;
2202 case 'K':
2203 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2204 env->flags |= REG_AUGMENTED|REG_SHELL|REG_LEFT|REG_RIGHT;
2205 typ = KRE;
2206 break;
2207 case 'M':
2208 /* used by caller to disable glob(3) GLOB_BRACE */
2209 break;
2210 case 'N':
2211 /* used by caller to disable glob(3) GLOB_NOCHECK */
2212 break;
2213 case 'O':
2214 /* used by caller to disable glob(3) GLOB_STARSTAR */
2215 break;
2216 case 'P':
2217 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2218 env->flags |= REG_EXTENDED|REG_CLASS_ESCAPE;
2219 typ = ERE;
2220 break;
2221 case 'S':
2222 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2223 env->flags |= REG_SHELL|REG_LEFT|REG_RIGHT;
2224 typ = SRE;
2225 break;
2226 case 'U': /* PCRE_UNGREEDY */
2227 if (typ >= 0 || env->type == ERE && (env->flags & REG_CLASS_ESCAPE))
2228 {
2229 if (i)
2230 env->flags |= REG_MINIMAL;
2231 else
2232 env->flags &= ~REG_MINIMAL;
2233 }
2234 break;
2235 case 'V':
2236 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_REGEXP|REG_SHELL|REG_LEFT|REG_RIGHT);
2237 env->flags |= REG_REGEXP;
2238 typ = BRE;
2239 break;
2240 default:
2241 env->error = REG_BADRPT;
2242 return 0;
2243 }
2244 eat(env);
2245 c = token(env);
2246 }
2247 done:
2248 break;
2249 case ':':
2250 x = REX_GROUP;
2251 break;
2252 case '=':
2253 x = REX_GROUP_AHEAD;
2254 break;
2255 case '!':
2256 x = REX_GROUP_AHEAD_NOT;
2257 break;
2258 case '<':
2259 switch (token(env))
2260 {
2261 case '=':
2262 x = REX_GROUP_BEHIND;
2263 break;
2264 case '!':
2265 case T_BANG:
2266 x = REX_GROUP_BEHIND_NOT;
2267 break;
2268 default:
2269 env->error = REG_BADRPT;
2270 return 0;
2271 }
2272 eat(env);
2273 break;
2274 case '>':
2275 x = REX_GROUP_CUT;
2276 break;
2277 case '%':
2278 case T_PERCENT:
2279 e = node(env, REX_NEST, 0, 0, (UCHAR_MAX + 1) * sizeof(unsigned short));
2280 e->re.nest.primary = isalnum(*env->cursor) ? -1 : *env->cursor;
2281 n = 1;
2282 for (;;)
2283 {
2284 switch (i = chr(env, &esc))
2285 {
2286 case -1:
2287 case 0:
2288 invalid:
2289 env->cursor -= esc + 1;
2290 env->error = REG_EPAREN;
2291 return 0;
2292 case 'D':
2293 x = REX_NEST_delimiter;
2294 /*FALLTHROUGH*/
2295 delimiter:
2296 if ((i = chr(env, &esc)) < 0)
2297 goto invalid;
2298 if (e->re.nest.type[i] & ~x)
2299 goto invalid;
2300 e->re.nest.type[i] = x;
2301 continue;
2302 case 'E':
2303 x = REX_NEST_escape;
2304 goto delimiter;
2305 case 'L':
2306 x = REX_NEST_literal;
2307 goto quote;
2308 case 'O':
2309 switch (i = chr(env, &esc))
2310 {
2311 case 'T':
2312 e->re.nest.type[UCHAR_MAX+1] |= REX_NEST_terminator;
2313 break;
2314 default:
2315 goto invalid;
2316 }
2317 continue;
2318 case 'Q':
2319 x = REX_NEST_quote;
2320 /*FALLTHROUGH*/
2321 quote:
2322 if ((i = chr(env, &esc)) < 0)
2323 goto invalid;
2324 if (e->re.nest.type[i] & ~x)
2325 goto invalid;
2326 e->re.nest.type[i] = x|REX_NEST_open|REX_NEST_close|(i<<REX_NEST_SHIFT);
2327 continue;
2328 case 'S':
2329 x = REX_NEST_separator;
2330 goto delimiter;
2331 case 'T':
2332 x = REX_NEST_terminator;
2333 goto delimiter;
2334 case '|':
2335 case '&':
2336 if (!esc)
2337 goto invalid;
2338 goto nesting;
2339 case '(':
2340 if (!esc)
2341 n++;
2342 goto nesting;
2343 case ')':
2344 if (!esc && !--n)
2345 break;
2346 goto nesting;
2347 default:
2348 nesting:
2349 if (isalnum(i) || (e->re.nest.type[i] & (REX_NEST_close|REX_NEST_escape|REX_NEST_literal|REX_NEST_quote|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)))
2350 goto invalid;
2351 e->re.nest.type[i] = REX_NEST_open;
2352 if ((x = chr(env, &esc)) < 0 || (e->re.nest.type[x] & (REX_NEST_close|REX_NEST_escape|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)))
2353 goto invalid;
2354 if (!esc)
2355 {
2356 if (x == ')' && !--n)
2357 goto invalid;
2358 else if (x == '(')
2359 n++;
2360 }
2361 e->re.nest.type[x] |= REX_NEST_close;
2362 e->re.nest.type[i] |= x << REX_NEST_SHIFT;
2363 continue;
2364 }
2365 break;
2366 }
2367 env->parnest--;
2368 if (c == T_PERCENT)
2369 for (n = 0; n < 2; n++)
2370 {
2371 parno = ++env->parno;
2372 if (!(f = node(env, REX_GROUP, 0, 0, 0)))
2373 {
2374 drop(env->disc, e);
2375 return 0;
2376 }
2377 if (parno < elementsof(env->paren))
2378 env->paren[parno] = f;
2379 f->re.group.back = 0;
2380 f->re.group.number = parno;
2381 f->re.group.expr.rex = e;
2382 e = f;
2383 }
2384 return e;
2385 case '(':
2386 c = 0;
2387 if (isdigit(*env->cursor))
2388 {
2389 f = 0;
2390 do
2391 {
2392 if (c > (INT_MAX / 10))
2393 {
2394 env->error = REG_BADRPT;
2395 return 0;
2396 }
2397 c = c * 10 + (*env->cursor++ - '0');
2398 } while (isdigit(*env->cursor));
2399 if (*env->cursor++ != ')')
2400 {
2401 env->error = REG_BADRPT;
2402 return 0;
2403 }
2404 if (c && env->type >= SRE)
2405 c = c * 2 - 1;
2406 if (!c || c > env->parno || !env->paren[c])
2407 {
2408 if (!(env->flags & (REG_LENIENT|REG_REGEXP)))
2409 {
2410 env->error = REG_ESUBREG;
2411 return 0;
2412 }
2413 if (c)
2414 c = -1;
2415 }
2416 }
2417 else
2418 {
2419 if (env->type < SRE && *env->cursor++ != '?')
2420 {
2421 env->error = REG_BADRPT;
2422 return 0;
2423 }
2424 if (!(f = grp(env, parno + 1)) && env->error)
2425 return 0;
2426 }
2427 if (!(e = node(env, REX_GROUP_COND, 0, 0, 0)))
2428 {
2429 drop(env->disc, f);
2430 return 0;
2431 }
2432 e->re.group.size = c;
2433 e->re.group.expr.binary.left = f;
2434 if (!(e->re.group.expr.binary.right = alt(env, parno, 1)))
2435 {
2436 drop(env->disc, e);
2437 return 0;
2438 }
2439 if (token(env) != T_CLOSE)
2440 {
2441 env->error = REG_EPAREN;
2442 return 0;
2443 }
2444 eat(env);
2445 env->parnest--;
2446 return rep(env, e, parno, parno);
2447 case '{':
2448 p = env->cursor;
2449 n = 1;
2450 while (c = *env->cursor)
2451 {
2452 if (c == '\\' && *(env->cursor + 1))
2453 env->cursor++;
2454 else if (c == '{')
2455 n++;
2456 else if (c == '}' && !--n)
2457 break;
2458 else if (c == env->delimiter || c == env->terminator)
2459 break;
2460 env->cursor++;
2461 }
2462 if (c != '}')
2463 {
2464 env->error = REG_EBRACE;
2465 return 0;
2466 }
2467 if (*++env->cursor != ')')
2468 {
2469 env->error = REG_EPAREN;
2470 return 0;
2471 }
2472 env->cursor++;
2473 env->parnest--;
2474 if (env->disc->re_version < REG_VERSION_EXEC)
2475 {
2476 env->error = REG_BADRPT;
2477 return 0;
2478 }
2479 if (!env->disc->re_execf)
2480 return 0;
2481 if (!(e = node(env, REX_EXEC, 0, 0, 0)))
2482 return 0;
2483 e->re.exec.text = (const char*)p;
2484 e->re.exec.size = env->cursor - p - 2;
2485 if (!env->disc->re_compf)
2486 e->re.exec.data = 0;
2487 else
2488 e->re.exec.data = (*env->disc->re_compf)(env->regex, e->re.exec.text, e->re.exec.size, env->disc);
2489 return e;
2490 case '0': case '1': case '2': case '3': case '4':
2491 case '5': case '6': case '7': case '8': case '9':
2492 c -= '0';
2493 while (isdigit(*env->cursor))
2494 {
2495 if (c > (INT_MAX / 10))
2496 {
2497 env->error = REG_ESUBREG;
2498 return 0;
2499 }
2500 c = c * 10 + *env->cursor++ - '0';
2501 }
2502 if (*env->cursor == ')')
2503 {
2504 env->cursor++;
2505 env->parnest--;
2506 env->token.len = 1;
2507 if (c > env->parno || !env->paren[c])
2508 {
2509 env->error = REG_ESUBREG;
2510 return 0;
2511 }
2512 env->paren[c]->re.group.back = 1;
2513 return rep(env, node(env, REX_BACK, c, 0, 0), 0, 0);
2514 }
2515 /*FALLTHROUGH*/
2516 default:
2517 env->error = REG_BADRPT;
2518 return 0;
2519 }
2520 p = env->pattern;
2521 i = env->type;
2522 if (x)
2523 {
2524 if (typ >= 0)
2525 env->type = typ;
2526 if (!(e = alt(env, parno, 0)))
2527 goto nope;
2528 env->flags = g;
2529 env->type = i;
2530 }
2531 c = token(env);
2532 env->parnest--;
2533 if (c != T_CLOSE && (!(env->flags & REG_LITERAL) || c != ')'))
2534 {
2535 env->error = REG_EPAREN;
2536 goto nope;
2537 }
2538 eat(env);
2539 if (typ >= 0 && beg)
2540 env->pattern = env->cursor;
2541 if (!x)
2542 {
2543 if (typ >= 0)
2544 env->type = typ;
2545 return 0;
2546 }
2547 if (!(f = node(env, x, 0, 0, 0)))
2548 {
2549 drop(env->disc, e);
2550 goto nope;
2551 }
2552 f->re.group.expr.rex = e;
2553 if (x == REX_GROUP_BEHIND || x == REX_GROUP_BEHIND_NOT)
2554 {
2555 if (stats(env, e))
2556 {
2557 drop(env->disc, f);
2558 if (!env->error)
2559 env->error = REG_ECOUNT;
2560 goto nope;
2561 }
2562 f->re.group.size = env->stats.m;
2563 memset(&env->stats, 0, sizeof(env->stats));
2564 }
2565 switch (x)
2566 {
2567 case REX_GROUP:
2568 case REX_GROUP_CUT:
2569 f = rep(env, f, parno, env->parno);
2570 break;
2571 }
2572 if (f)
2573 return f;
2574 nope:
2575 env->flags = g;
2576 env->pattern = p;
2577 env->type = i;
2578 return 0;
2579 }
2580
2581 static Rex_t*
seq(Cenv_t * env)2582 seq(Cenv_t* env)
2583 {
2584 Rex_t* e;
2585 Rex_t* f;
2586 Token_t tok;
2587 int c;
2588 int i;
2589 int n;
2590 int x;
2591 int parno;
2592 int type;
2593 regflags_t flags;
2594 unsigned char* s;
2595 unsigned char* p;
2596 unsigned char* t;
2597 unsigned char* u;
2598 unsigned char buf[256];
2599
2600 for (;;)
2601 {
2602 s = buf;
2603 while ((c = token(env)) < T_META && s < &buf[sizeof(buf) - env->token.len])
2604 {
2605 x = c;
2606 p = env->cursor;
2607 if (c >= 0)
2608 {
2609 n = 1;
2610 *s++ = (env->flags & REG_ICASE) ? toupper(c) : c;
2611 }
2612 else if (c == C_ESC || (env->flags & REG_ICASE))
2613 {
2614 c = (c == C_ESC) ? env->token.lex : mbchar(p);
2615 if (env->flags & REG_ICASE)
2616 c = towupper(c);
2617 if ((&buf[sizeof(buf)] - s) < MB_CUR_MAX)
2618 break;
2619 if ((n = mbconv((char*)s, c)) < 0)
2620 *s++ = c;
2621 else if (n)
2622 s += n;
2623 else
2624 {
2625 n = 1;
2626 *s++ = 0;
2627 }
2628 }
2629 else
2630 {
2631 n = env->token.len - env->token.esc;
2632 for (t = p, u = s + n; s < u; *s++ = *t++);
2633 }
2634 eat(env);
2635 }
2636 if (c == T_BAD)
2637 return 0;
2638 if (s > buf)
2639 switch (c)
2640 {
2641 case T_STAR:
2642 case T_PLUS:
2643 case T_LEFT:
2644 case T_QUES:
2645 case T_BANG:
2646 if ((s -= n) == buf)
2647 e = 0;
2648 else
2649 {
2650 i = s - buf;
2651 if (!(e = node(env, REX_STRING, 0, 0, i)))
2652 return 0;
2653 memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, i);
2654 e->re.string.size = i;
2655 }
2656 if (x >= 0)
2657 {
2658 if (!(f = node(env, REX_ONECHAR, 1, 1, 0)))
2659 {
2660 drop(env->disc, e);
2661 return 0;
2662 }
2663 f->re.onechar = (env->flags & REG_ICASE) ? toupper(x) : x;
2664 }
2665 else
2666 {
2667 if (!(f = node(env, REX_STRING, 0, 0, n)))
2668 return 0;
2669 memcpy((char*)(f->re.string.base = (unsigned char*)f->re.data), (char*)p, n);
2670 f->re.string.size = n;
2671 }
2672 if (!(f = rep(env, f, 0, 0)) || !(f = cat(env, f, seq(env))))
2673 {
2674 drop(env->disc, e);
2675 return 0;
2676 }
2677 if (e)
2678 f = cat(env, e, f);
2679 return f;
2680 default:
2681 c = s - buf;
2682 if (!(e = node(env, REX_STRING, 0, 0, c)))
2683 return 0;
2684 memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, c);
2685 e->re.string.size = c;
2686 return cat(env, e, seq(env));
2687 }
2688 else if (c > T_BACK)
2689 {
2690 eat(env);
2691 c -= T_BACK;
2692 if (c > env->parno || !env->paren[c])
2693 {
2694 env->error = REG_ESUBREG;
2695 return 0;
2696 }
2697 env->paren[c]->re.group.back = 1;
2698 e = rep(env, node(env, REX_BACK, c, 0, 0), 0, 0);
2699 }
2700 else
2701 switch (c)
2702 {
2703 case T_AND:
2704 case T_CLOSE:
2705 case T_BAR:
2706 case T_END:
2707 return node(env, REX_NULL, 0, 0, 0);
2708 case T_DOLL:
2709 eat(env);
2710 e = rep(env, node(env, REX_END, 0, 0, 0), 0, 0);
2711 break;
2712 case T_CFLX:
2713 eat(env);
2714 if ((e = node(env, REX_BEG, 0, 0, 0)) && (env->flags & REG_EXTENDED))
2715 e = rep(env, e, 0, 0);
2716 break;
2717 case T_OPEN:
2718 tok = env->token;
2719 eat(env);
2720 flags = env->flags;
2721 type = env->type;
2722 if (env->token.att)
2723 env->flags |= REG_MINIMAL;
2724 env->parnest++;
2725 if (env->type == KRE)
2726 ++env->parno;
2727 parno = ++env->parno;
2728 if (!(e = alt(env, parno + 1, 0)))
2729 break;
2730 if (e->type == REX_NULL && env->type == ERE && !(env->flags & (REG_NULL|REG_REGEXP)))
2731 {
2732 drop(env->disc, e);
2733 env->error = (*env->cursor == 0 || *env->cursor == env->delimiter || *env->cursor == env->terminator) ? REG_EPAREN : REG_ENULL;
2734 return 0;
2735 }
2736 if (token(env) != T_CLOSE)
2737 {
2738 drop(env->disc, e);
2739 env->error = REG_EPAREN;
2740 return 0;
2741 }
2742 env->parnest--;
2743 eat(env);
2744 if (!(f = node(env, REX_GROUP, 0, 0, 0)))
2745 {
2746 drop(env->disc, e);
2747 return 0;
2748 }
2749 if (parno < elementsof(env->paren))
2750 env->paren[parno] = f;
2751 f->re.group.back = 0;
2752 f->re.group.number = parno;
2753 f->re.group.expr.rex = e;
2754 if (tok.lex)
2755 {
2756 tok.push = 1;
2757 env->token = tok;
2758 }
2759 if (!(e = rep(env, f, parno, env->parno)))
2760 return 0;
2761 if (env->type == KRE)
2762 {
2763 if (!(f = node(env, REX_GROUP, 0, 0, 0)))
2764 {
2765 drop(env->disc, e);
2766 return 0;
2767 }
2768 if (--parno < elementsof(env->paren))
2769 env->paren[parno] = f;
2770 f->re.group.back = 0;
2771 f->re.group.number = parno;
2772 f->re.group.expr.rex = e;
2773 e = f;
2774 }
2775 env->flags = flags;
2776 env->type = type;
2777 break;
2778 case T_GROUP:
2779 p = env->cursor;
2780 eat(env);
2781 flags = env->flags;
2782 type = env->type;
2783 if (!(e = grp(env, env->parno + 1)))
2784 {
2785 if (env->error)
2786 return 0;
2787 if (env->literal == env->pattern && env->literal == p)
2788 env->literal = env->cursor;
2789 continue;
2790 }
2791 env->flags = flags;
2792 env->type = type;
2793 break;
2794 case T_BRA:
2795 eat(env);
2796 if (e = bra(env))
2797 e = rep(env, e, 0, 0);
2798 break;
2799 case T_ALNUM:
2800 case T_ALNUM_NOT:
2801 case T_DIGIT:
2802 case T_DIGIT_NOT:
2803 case T_SPACE:
2804 case T_SPACE_NOT:
2805 eat(env);
2806 if (e = ccl(env, c))
2807 e = rep(env, e, 0, 0);
2808 break;
2809 case T_LT:
2810 eat(env);
2811 e = rep(env, node(env, REX_WBEG, 0, 0, 0), 0, 0);
2812 break;
2813 case T_GT:
2814 eat(env);
2815 e = rep(env, node(env, REX_WEND, 0, 0, 0), 0, 0);
2816 break;
2817 case T_DOT:
2818 eat(env);
2819 e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0);
2820 break;
2821 case T_DOTSTAR:
2822 eat(env);
2823 env->token.lex = T_STAR;
2824 env->token.push = 1;
2825 e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0);
2826 break;
2827 case T_SLASHPLUS:
2828 eat(env);
2829 env->token.lex = T_PLUS;
2830 env->token.push = 1;
2831 if (e = node(env, REX_ONECHAR, 1, 1, 0))
2832 {
2833 e->re.onechar = '/';
2834 e = rep(env, e, 0, 0);
2835 }
2836 break;
2837 case T_WORD:
2838 eat(env);
2839 e = rep(env, node(env, REX_WORD, 0, 0, 0), 0, 0);
2840 break;
2841 case T_WORD_NOT:
2842 eat(env);
2843 e = rep(env, node(env, REX_WORD_NOT, 0, 0, 0), 0, 0);
2844 break;
2845 case T_BEG_STR:
2846 eat(env);
2847 e = rep(env, node(env, REX_BEG_STR, 0, 0, 0), 0, 0);
2848 break;
2849 case T_END_STR:
2850 eat(env);
2851 e = rep(env, node(env, REX_END_STR, 0, 0, 0), 0, 0);
2852 break;
2853 case T_FIN_STR:
2854 eat(env);
2855 e = rep(env, node(env, REX_FIN_STR, 0, 0, 0), 0, 0);
2856 break;
2857 default:
2858 env->error = REG_BADRPT;
2859 return 0;
2860 }
2861 if (e && *env->cursor != 0 && *env->cursor != env->delimiter && *env->cursor != env->terminator)
2862 e = cat(env, e, seq(env));
2863 return e;
2864 }
2865 }
2866
2867 static Rex_t*
con(Cenv_t * env)2868 con(Cenv_t* env)
2869 {
2870 Rex_t* e;
2871 Rex_t* f;
2872 Rex_t* g;
2873
2874 if (!(e = seq(env)) || !(env->flags & REG_AUGMENTED) || token(env) != T_AND)
2875 return e;
2876 eat(env);
2877 if (!(f = con(env)))
2878 {
2879 drop(env->disc, e);
2880 return 0;
2881 }
2882 if (!(g = node(env, REX_CONJ, 0, 0, 0)))
2883 {
2884 drop(env->disc, e);
2885 drop(env->disc, f);
2886 return 0;
2887 }
2888 g->re.group.expr.binary.left = e;
2889 g->re.group.expr.binary.right = f;
2890 return g;
2891 }
2892
2893 static Rex_t*
alt(Cenv_t * env,int number,int cond)2894 alt(Cenv_t* env, int number, int cond)
2895 {
2896 Rex_t* e;
2897 Rex_t* f;
2898 Rex_t* g;
2899
2900 if (!(e = con(env)))
2901 return 0;
2902 else if (token(env) != T_BAR)
2903 {
2904 if (!cond)
2905 return e;
2906 f = 0;
2907 if (e->type == REX_NULL)
2908 goto bad;
2909 }
2910 else
2911 {
2912 eat(env);
2913 if (!(f = alt(env, number, 0)))
2914 {
2915 drop(env->disc, e);
2916 return 0;
2917 }
2918 if ((e->type == REX_NULL || f->type == REX_NULL) && !(env->flags & (REG_NULL|REG_REGEXP)))
2919 goto bad;
2920 if (!cond && (g = trie(env, e, f)))
2921 return g;
2922 }
2923 if (!(g = node(env, REX_ALT, 0, 0, 0)))
2924 {
2925 env->error = REG_ESPACE;
2926 goto bad;
2927 }
2928 g->re.group.number = number;
2929 g->re.group.last = env->parno;
2930 g->re.group.expr.binary.left = e;
2931 g->re.group.expr.binary.right = f;
2932 return g;
2933 bad:
2934 drop(env->disc, e);
2935 drop(env->disc, f);
2936 if (!env->error)
2937 env->error = REG_ENULL;
2938 return 0;
2939 }
2940
2941 /*
2942 * add v to REX_BM tables
2943 */
2944
2945 static void
bmstr(Cenv_t * env,register Rex_t * a,unsigned char * v,int n,Bm_mask_t b)2946 bmstr(Cenv_t* env, register Rex_t* a, unsigned char* v, int n, Bm_mask_t b)
2947 {
2948 int c;
2949 int m;
2950 size_t z;
2951
2952 for (m = 0; m < n; m++)
2953 {
2954 if (!(z = n - m - 1))
2955 z = HIT;
2956 c = v[m];
2957 a->re.bm.mask[m][c] |= b;
2958 if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT)
2959 a->re.bm.skip[c] = z;
2960 if (a->flags & REG_ICASE)
2961 {
2962 if (isupper(c))
2963 c = tolower(c);
2964 else if (islower(c))
2965 c = toupper(c);
2966 else
2967 continue;
2968 a->re.bm.mask[m][c] |= b;
2969 if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT)
2970 a->re.bm.skip[c] = z;
2971 }
2972 }
2973 }
2974
2975 /*
2976 * set up BM table from trie
2977 */
2978
2979 static int
bmtrie(Cenv_t * env,Rex_t * a,unsigned char * v,Trie_node_t * x,int n,int m,Bm_mask_t b)2980 bmtrie(Cenv_t* env, Rex_t* a, unsigned char* v, Trie_node_t* x, int n, int m, Bm_mask_t b)
2981 {
2982 do
2983 {
2984 v[m] = x->c;
2985 if (m >= (n - 1))
2986 {
2987 bmstr(env, a, v, n, b);
2988 if (!(b <<= 1))
2989 {
2990 b = 1;
2991 a->re.bm.complete = 0;
2992 }
2993 else if (x->son)
2994 a->re.bm.complete = 0;
2995 }
2996 else if (x->son)
2997 b = bmtrie(env, a, v, x->son, n, m + 1, b);
2998 } while (x = x->sib);
2999 return b;
3000 }
3001
3002 /*
3003 * rewrite the expression tree for some special cases
3004 * 1. it is a null expression - illegal
3005 * 2. max length fixed string found -- use BM algorithm
3006 * 3. it begins with an unanchored string - use KMP algorithm
3007 * 0 returned on success
3008 */
3009
3010 static int
special(Cenv_t * env,regex_t * p)3011 special(Cenv_t* env, regex_t* p)
3012 {
3013 Rex_t* a;
3014 Rex_t* e;
3015 Rex_t* t;
3016 Rex_t* x;
3017 Rex_t* y;
3018 unsigned char* s;
3019 int* f;
3020 int n;
3021 int m;
3022 int k;
3023
3024 DEBUG_INIT();
3025 if (e = p->env->rex)
3026 {
3027 if ((x = env->stats.x) && x->re.string.size < 3)
3028 x = 0;
3029 if ((t = env->stats.y) && t->re.trie.min < 3)
3030 t = 0;
3031 if (x && t)
3032 {
3033 if (x->re.string.size >= t->re.trie.min)
3034 t = 0;
3035 else
3036 x = 0;
3037 }
3038 if (x || t)
3039 {
3040 Bm_mask_t** mask;
3041 Bm_mask_t* h;
3042 unsigned char* v;
3043 size_t* q;
3044 size_t l;
3045 int i;
3046 int j;
3047
3048 if (x)
3049 {
3050 y = x;
3051 n = m = x->re.string.size;
3052 l = env->stats.l;
3053 }
3054 else
3055 {
3056 y = t;
3057 n = t->re.trie.min;
3058 m = t->re.trie.max;
3059 l = env->stats.k;
3060 }
3061 if (!(q = (size_t*)alloc(env->disc, 0, (n + 1) * sizeof(size_t))))
3062 return 1;
3063 if (!(a = node(env, REX_BM, 0, 0, n * (sizeof(Bm_mask_t*) + (UCHAR_MAX + 1) * sizeof(Bm_mask_t)) + (UCHAR_MAX + n + 2) * sizeof(size_t))))
3064 {
3065 alloc(env->disc, q, 0);
3066 return 1;
3067 }
3068 a->flags = y->flags;
3069 a->map = y->map;
3070 a->re.bm.size = n;
3071 a->re.bm.back = (y == e || y == e->re.group.expr.rex) ? (m - n) : -1;
3072 a->re.bm.left = l - 1;
3073 a->re.bm.right = env->stats.m - l - n;
3074 a->re.bm.complete = (env->stats.e || y != e && (e->type != REX_GROUP || y != e->re.group.expr.rex) || e->next || ((a->re.bm.left + a->re.bm.right) >= 0)) ? 0 : n;
3075 h = (Bm_mask_t*)&a->re.bm.mask[n];
3076 a->re.bm.skip = (size_t*)(h + n * (UCHAR_MAX + 1));
3077 a->re.bm.fail = &a->re.bm.skip[UCHAR_MAX + 1];
3078 for (m = 0; m <= UCHAR_MAX; m++)
3079 a->re.bm.skip[m] = n;
3080 a->re.bm.skip[0] = a->re.bm.skip[env->mappednewline] = (y->next && y->next->type == REX_END) ? HIT : (n + a->re.bm.left);
3081 for (i = 1; i <= n; i++)
3082 a->re.bm.fail[i] = 2 * n - i;
3083 mask = a->re.bm.mask;
3084 for (m = 0; m < n; m++)
3085 {
3086 mask[m] = h;
3087 h += UCHAR_MAX + 1;
3088 }
3089 if (x)
3090 bmstr(env, a, x->re.string.base, n, 1);
3091 else
3092 {
3093 v = (unsigned char*)q;
3094 memset(v, 0, n);
3095 m = 1;
3096 for (i = 0; i <= UCHAR_MAX; i++)
3097 if (t->re.trie.root[i])
3098 m = bmtrie(env, a, v, t->re.trie.root[i], n, 0, m);
3099 }
3100 mask--;
3101 memset(q, 0, n * sizeof(*q));
3102 for (k = (j = n) + 1; j > 0; j--, k--)
3103 {
3104 DEBUG_TEST(0x0010,(sfprintf(sfstderr, "BM#0: k=%d j=%d\n", k, j)),(0));
3105 for (q[j] = k; k <= n; k = q[k])
3106 {
3107 for (m = 0; m <= UCHAR_MAX; m++)
3108 if (mask[k][m] == mask[j][m])
3109 {
3110 DEBUG_TEST(0x0010,sfprintf(sfstderr, "CUT1: mask[%d][%c]=mask[%d][%c]\n", k, m, j, m), (0));
3111 goto cut;
3112 }
3113 DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#2: fail[%d]=%d => %d\n", k, a->re.bm.fail[k], (a->re.bm.fail[k] > n - j) ? (n - j) : a->re.bm.fail[k]), (0));
3114 if (a->re.bm.fail[k] > n - j)
3115 a->re.bm.fail[k] = n - j;
3116 }
3117 cut: ;
3118 }
3119 for (i = 1; i <= n; i++)
3120 if (a->re.bm.fail[i] > n + k - i)
3121 {
3122 DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#4: fail[%d]=%d => %d\n", i, a->re.bm.fail[i], n + k - i), (0));
3123 a->re.bm.fail[i] = n + k - i;
3124 }
3125 #if _AST_REGEX_DEBUG
3126 if (DEBUG_TEST(0x0020,(1),(0)))
3127 {
3128 sfprintf(sfstderr, "STAT: complete=%d n=%d k=%d l=%d r=%d y=%d:%d e=%d:%d\n", a->re.bm.complete, n, k, a->re.bm.left, a->re.bm.right, y->type, y->next ? y->next->type : 0, e->type, e->next ? e->next->type : 0);
3129 for (m = 0; m < n; m++)
3130 for (i = 1; i <= UCHAR_MAX; i++)
3131 if (a->re.bm.mask[m][i])
3132 sfprintf(sfstderr, "MASK: [%d]['%c'] = %032..2u\n", m, i, a->re.bm.mask[m][i]);
3133 for (i = ' '; i <= UCHAR_MAX; i++)
3134 if (a->re.bm.skip[i] >= HIT)
3135 sfprintf(sfstderr, "SKIP: ['%c'] = *\n", i);
3136 else if (a->re.bm.skip[i] > 0 && a->re.bm.skip[i] < n)
3137 sfprintf(sfstderr, "SKIP: ['%c'] = %3d\n", i, a->re.bm.skip[i]);
3138 for (j = 31; j >= 0; j--)
3139 {
3140 sfprintf(sfstderr, " ");
3141 next:
3142 for (m = 0; m < n; m++)
3143 {
3144 for (i = 0040; i < 0177; i++)
3145 if (a->re.bm.mask[m][i] & (1 << j))
3146 {
3147 sfprintf(sfstderr, " %c", i);
3148 break;
3149 }
3150 if (i >= 0177)
3151 {
3152 if (j > 0)
3153 {
3154 j--;
3155 goto next;
3156 }
3157 sfprintf(sfstderr, " ?");
3158 }
3159 }
3160 sfprintf(sfstderr, "\n");
3161 }
3162 sfprintf(sfstderr, "FAIL: ");
3163 for (m = 1; m <= n; m++)
3164 sfprintf(sfstderr, "%3d", a->re.bm.fail[m]);
3165 sfprintf(sfstderr, "\n");
3166 }
3167 #endif
3168 alloc(env->disc, q, 0);
3169 a->next = e;
3170 p->env->rex = a;
3171 return 0;
3172 }
3173 switch (e->type)
3174 {
3175 case REX_BEG:
3176 if (env->flags & REG_NEWLINE)
3177 return 0;
3178 break;
3179 case REX_GROUP:
3180 if (env->stats.b)
3181 return 0;
3182 e = e->re.group.expr.rex;
3183 if (e->type != REX_DOT)
3184 return 0;
3185 /*FALLTHROUGH*/
3186 case REX_DOT:
3187 if (e->lo == 0 && e->hi == RE_DUP_INF)
3188 break;
3189 return 0;
3190 case REX_NULL:
3191 if (env->flags & (REG_NULL|REG_REGEXP))
3192 break;
3193 env->error = REG_ENULL;
3194 return 1;
3195 case REX_STRING:
3196 if ((env->flags & (REG_LEFT|REG_LITERAL|REG_RIGHT)) || e->map)
3197 return 0;
3198 s = e->re.string.base;
3199 n = e->re.string.size;
3200 if (!(a = node(env, REX_KMP, 0, 0, n * (sizeof(int*) + 1))))
3201 return 1;
3202 a->flags = e->flags;
3203 a->map = e->map;
3204 f = a->re.string.fail;
3205 memcpy((char*)(a->re.string.base = (unsigned char*)&f[n]), (char*)s, n);
3206 s = a->re.string.base;
3207 a->re.string.size = n;
3208 f[0] = m = -1;
3209 for (k = 1; k < n; k++)
3210 {
3211 while (m >= 0 && s[m+1] != s[k])
3212 m = f[m];
3213 if (s[m+1] == s[k])
3214 m++;
3215 f[k] = m;
3216 }
3217 a->next = e->next;
3218 p->env->rex = a;
3219 e->next = 0;
3220 drop(env->disc, e);
3221 break;
3222 default:
3223 return 0;
3224 }
3225 }
3226 p->env->once = 1;
3227 return 0;
3228 }
3229
3230 int
regcomp(regex_t * p,const char * pattern,regflags_t flags)3231 regcomp(regex_t* p, const char* pattern, regflags_t flags)
3232 {
3233 Rex_t* e;
3234 Rex_t* f;
3235 regdisc_t* disc;
3236 unsigned char* fold;
3237 int i;
3238 Cenv_t env;
3239
3240 if (!p)
3241 return REG_BADPAT;
3242 if (flags & REG_DISCIPLINE)
3243 {
3244 flags &= ~REG_DISCIPLINE;
3245 disc = p->re_disc;
3246 }
3247 else
3248 disc = &state.disc;
3249 if (!disc->re_errorlevel)
3250 disc->re_errorlevel = 2;
3251 p->env = 0;
3252 if (!pattern)
3253 return fatal(disc, REG_BADPAT, pattern);
3254 if (!state.initialized)
3255 {
3256 state.initialized = 1;
3257 for (i = 0; i < elementsof(state.escape); i++)
3258 state.magic[state.escape[i].key] = state.escape[i].val;
3259 }
3260 if (!(fold = (unsigned char*)LCINFO(AST_LC_CTYPE)->data))
3261 {
3262 if (!(fold = newof(0, unsigned char, UCHAR_MAX, 1)))
3263 return fatal(disc, REG_ESPACE, pattern);
3264 for (i = 0; i <= UCHAR_MAX; i++)
3265 fold[i] = toupper(i);
3266 LCINFO(AST_LC_CTYPE)->data = (void*)fold;
3267 }
3268 again:
3269 if (!(p->env = (Env_t*)alloc(disc, 0, sizeof(Env_t))))
3270 return fatal(disc, REG_ESPACE, pattern);
3271 memset(p->env, 0, sizeof(*p->env));
3272 memset(&env, 0, sizeof(env));
3273 env.regex = p;
3274 env.flags = flags;
3275 env.disc = p->env->disc = disc;
3276 if (env.flags & REG_AUGMENTED)
3277 env.flags |= REG_EXTENDED;
3278 env.mappeddot = '.';
3279 env.mappednewline = '\n';
3280 env.mappedslash = '/';
3281 if (disc->re_version >= REG_VERSION_MAP && disc->re_map)
3282 {
3283 env.map = disc->re_map;
3284 env.MAP = p->env->fold;
3285 for (i = 0; i <= UCHAR_MAX; i++)
3286 {
3287 env.MAP[i] = fold[env.map[i]];
3288 if (env.map[i] == '.')
3289 env.mappeddot = i;
3290 if (env.map[i] == '\n')
3291 env.mappednewline = i;
3292 if (env.map[i] == '/')
3293 env.mappedslash = i;
3294 }
3295 }
3296 else
3297 env.MAP = fold;
3298 env.type = (env.flags & REG_AUGMENTED) ? ARE : (env.flags & REG_EXTENDED) ? ERE : BRE;
3299 env.explicit = -1;
3300 if (env.flags & REG_SHELL)
3301 {
3302 if (env.flags & REG_SHELL_PATH)
3303 env.explicit = env.mappedslash;
3304 if (!(env.flags & REG_SHELL_ESCAPED))
3305 env.flags |= REG_CLASS_ESCAPE;
3306 env.flags |= REG_LENIENT|REG_NULL;
3307 env.type = env.type == BRE ? SRE : KRE;
3308 }
3309 else
3310 env.flags &= ~(REG_SHELL_DOT|REG_SHELL_ESCAPED|REG_SHELL_GROUP|REG_SHELL_PATH);
3311 if ((env.flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE)
3312 env.explicit = env.mappednewline;
3313 p->env->leading = (env.flags & REG_SHELL_DOT) ? env.mappeddot : -1;
3314 env.posixkludge = !(env.flags & (REG_EXTENDED|REG_SHELL));
3315 env.token.lex = 0;
3316 env.token.push = 0;
3317 if (env.flags & REG_DELIMITED)
3318 {
3319 switch (env.delimiter = *pattern++)
3320 {
3321 case 0:
3322 case '\\':
3323 case '\n':
3324 case '\r':
3325 env.error = REG_EDELIM;
3326 goto bad;
3327 }
3328 env.terminator = '\n';
3329 }
3330 env.literal = env.pattern = env.cursor = (unsigned char*)pattern;
3331 if (!(p->env->rex = alt(&env, 1, 0)))
3332 goto bad;
3333 if (env.parnest)
3334 {
3335 env.error = REG_EPAREN;
3336 goto bad;
3337 }
3338 if ((env.flags & REG_LEFT) && p->env->rex->type != REX_BEG)
3339 {
3340 if (p->env->rex->type == REX_ALT)
3341 env.flags &= ~REG_FIRST;
3342 if (!(e = node(&env, REX_BEG, 0, 0, 0)))
3343 {
3344 regfree(p);
3345 return fatal(disc, REG_ESPACE, pattern);
3346 }
3347 e->next = p->env->rex;
3348 p->env->rex = e;
3349 p->env->once = 1;
3350 }
3351 for (e = p->env->rex; e->next; e = e->next);
3352 p->env->done.type = REX_DONE;
3353 p->env->done.flags = e->flags;
3354 if ((env.flags & REG_RIGHT) && e->type != REX_END)
3355 {
3356 if (p->env->rex->type == REX_ALT)
3357 env.flags &= ~REG_FIRST;
3358 if (!(f = node(&env, REX_END, 0, 0, 0)))
3359 {
3360 regfree(p);
3361 return fatal(disc, REG_ESPACE, pattern);
3362 }
3363 f->flags = e->flags;
3364 f->map = e->map;
3365 e->next = f;
3366 }
3367 if (stats(&env, p->env->rex))
3368 {
3369 if (!env.error)
3370 env.error = REG_ECOUNT;
3371 goto bad;
3372 }
3373 if (env.stats.b)
3374 p->env->hard = p->env->separate = 1;
3375 else if (!(env.flags & REG_FIRST) && (env.stats.a || env.stats.c > 1 && env.stats.c != env.stats.s || env.stats.t && (env.stats.t > 1 || env.stats.a || env.stats.c)))
3376 p->env->hard = 1;
3377 if (p->env->hard || env.stats.c || env.stats.i)
3378 p->env->stats.re_min = p->env->stats.re_max = -1;
3379 else
3380 {
3381 if (!(p->env->stats.re_min = env.stats.m))
3382 p->env->stats.re_min = -1;
3383 if (!(p->env->stats.re_max = env.stats.n))
3384 p->env->stats.re_max = -1;
3385 }
3386 if (special(&env, p))
3387 goto bad;
3388 serialize(&env, p->env->rex, 1);
3389 p->re_nsub = env.stats.p;
3390 if (env.type == KRE)
3391 p->re_nsub /= 2;
3392 if (env.flags & REG_DELIMITED)
3393 {
3394 p->re_npat = env.cursor - env.pattern + 1;
3395 if (*env.cursor == env.delimiter)
3396 p->re_npat++;
3397 else if (env.flags & REG_MUSTDELIM)
3398 {
3399 env.error = REG_EDELIM;
3400 goto bad;
3401 }
3402 else
3403 env.flags &= ~REG_DELIMITED;
3404 }
3405 p->env->explicit = env.explicit;
3406 p->env->flags = env.flags & REG_COMP;
3407 p->env->min = env.stats.m;
3408 p->env->nsub = env.stats.p + env.stats.u;
3409 p->env->refs = 1;
3410 return 0;
3411 bad:
3412 regfree(p);
3413 if (!env.error)
3414 env.error = REG_ESPACE;
3415 if (env.type >= SRE && env.error != REG_ESPACE && !(flags & REG_LITERAL))
3416 {
3417 flags |= REG_LITERAL;
3418 pattern = (const char*)env.literal;
3419 goto again;
3420 }
3421 return fatal(disc, env.error, pattern);
3422 }
3423
3424 /*
3425 * regcomp() on sized pattern
3426 * the lazy way around adding and checking env.end
3427 */
3428
3429 int
regncomp(regex_t * p,const char * pattern,size_t size,regflags_t flags)3430 regncomp(regex_t* p, const char* pattern, size_t size, regflags_t flags)
3431 {
3432 char* s;
3433 int r;
3434
3435 if (!(s = malloc(size + 1)))
3436 return fatal((flags & REG_DISCIPLINE) ? p->re_disc : &state.disc, REG_ESPACE, pattern);
3437 memcpy(s, pattern, size);
3438 s[size] = 0;
3439 r = regcomp(p, s, flags);
3440 free(s);
3441 return r;
3442 }
3443
3444 /*
3445 * combine two compiled regular expressions if possible,
3446 * replacing first with the combination and freeing second.
3447 * return 0 on success.
3448 * the only combinations handled are building a Trie
3449 * from String|Kmp|Trie and String|Kmp
3450 */
3451
3452 int
regcomb(regex_t * p,regex_t * q)3453 regcomb(regex_t* p, regex_t* q)
3454 {
3455 Rex_t* e = p->env->rex;
3456 Rex_t* f = q->env->rex;
3457 Rex_t* g;
3458 Rex_t* h;
3459 Cenv_t env;
3460
3461 if (!e || !f)
3462 return fatal(p->env->disc, REG_BADPAT, NiL);
3463 if (p->env->separate || q->env->separate)
3464 return REG_ESUBREG;
3465 memset(&env, 0, sizeof(env));
3466 env.disc = p->env->disc;
3467 if (e->type == REX_BM)
3468 {
3469 p->env->rex = e->next;
3470 e->next = 0;
3471 drop(env.disc, e);
3472 e = p->env->rex;
3473 }
3474 if (f->type == REX_BM)
3475 {
3476 q->env->rex = f->next;
3477 f->next = 0;
3478 drop(env.disc, f);
3479 f = q->env->rex;
3480 }
3481 if (e->type == REX_BEG && f->type == REX_BEG)
3482 {
3483 p->env->flags |= REG_LEFT;
3484 p->env->rex = e->next;
3485 e->next = 0;
3486 drop(env.disc, e);
3487 e = p->env->rex;
3488 q->env->rex = f->next;
3489 f->next = 0;
3490 drop(env.disc, f);
3491 f = q->env->rex;
3492 }
3493 for (g = e; g->next; g = g->next);
3494 for (h = f; h->next; h = h->next);
3495 if (g->next && g->next->type == REX_END && h->next && h->next->type == REX_END)
3496 {
3497 p->env->flags |= REG_RIGHT;
3498 drop(env.disc, g->next);
3499 g->next = 0;
3500 drop(env.disc, h->next);
3501 h->next = 0;
3502 }
3503 if (!(g = trie(&env, f, e)))
3504 return fatal(p->env->disc, REG_BADPAT, NiL);
3505 p->env->rex = g;
3506 if (!q->env->once)
3507 p->env->once = 0;
3508 q->env->rex = 0;
3509 if (p->env->flags & REG_LEFT)
3510 {
3511 if (!(e = node(&env, REX_BEG, 0, 0, 0)))
3512 {
3513 regfree(p);
3514 return fatal(p->env->disc, REG_ESPACE, NiL);
3515 }
3516 e->next = p->env->rex;
3517 p->env->rex = e;
3518 p->env->once = 1;
3519 }
3520 if (p->env->flags & REG_RIGHT)
3521 {
3522 for (f = p->env->rex; f->next; f = f->next);
3523 if (f->type != REX_END)
3524 {
3525 if (!(e = node(&env, REX_END, 0, 0, 0)))
3526 {
3527 regfree(p);
3528 return fatal(p->env->disc, REG_ESPACE, NiL);
3529 }
3530 f->next = e;
3531 }
3532 }
3533 env.explicit = p->env->explicit;
3534 env.flags = p->env->flags;
3535 env.disc = p->env->disc;
3536 if (stats(&env, p->env->rex))
3537 {
3538 regfree(p);
3539 return fatal(p->env->disc, env.error ? env.error : REG_ECOUNT, NiL);
3540 }
3541 if (special(&env, p))
3542 {
3543 regfree(p);
3544 return fatal(p->env->disc, env.error ? env.error : REG_ESPACE, NiL);
3545 }
3546 p->env->min = g->re.trie.min;
3547 return 0;
3548 }
3549
3550 /*
3551 * copy a reference of p into q
3552 * p and q may then have separate regsubcomp() instantiations
3553 */
3554
3555 int
regdup(regex_t * p,regex_t * q)3556 regdup(regex_t* p, regex_t* q)
3557 {
3558 if (!p || !q)
3559 return REG_BADPAT;
3560 *q = *p;
3561 p->env->refs++;
3562 q->re_sub = 0;
3563 return 0;
3564 }
3565