1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 1985-2010 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Common Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.opensource.org/licenses/cpl1.0.txt *
11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
19 * Phong Vo <kpv@research.att.com> *
20 * *
21 ***********************************************************************/
22 #pragma prototyped
23
24 /*
25 * posix regex compiler
26 */
27
28 #include "reglib.h"
29
30 #if _PACKAGE_ast
31 #include "lclib.h"
32 #endif
33
34 #define serialize re_serialize /* hp.ia64 <unistd.h>! */
35
36 #define C_ESC (-1)
37 #define C_MB (-2)
38
39 #if _AST_REGEX_DEBUG
40
41 #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n))
42 #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0)
43 #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_comp_debug")) debug |= strtoul(t, NiL, 0); } } while (0)
44
45 static unsigned long debug;
46 static unsigned long debug_flag;
47
48 #else
49
50 #define DEBUG_INIT()
51 #define DEBUG_TEST(f,y,n) (n)
52 #define DEBUG_CODE(f,y,n) do {n} while(0)
53
54 #endif
55
56 #if _PACKAGE_ast
57
58 typedef struct Cchr_s
59 {
60 Dtlink_t lnk;
61 unsigned char nam[2];
62 Ckey_t key;
63 } Cchr_t;
64
65 #endif
66
67 #define eat(p) do{if ((p)->token.push)(p)->token.push=0;else (p)->cursor+=(p)->token.len;}while (0)
68
69 /*
70 * determine whether greedy matching will work, i.e. produce
71 * the best match first. such expressions are "easy", and
72 * need no backtracking once a complete match is found.
73 * if an expression has backreferences or alts it's hard
74 * else if it has only one closure it's easy
75 * else if all closures are simple (i.e. one-character) it's easy
76 * else it's hard.
77 */
78
79 typedef struct Stats_s
80 {
81 unsigned long l; /* min length to left of x */
82 unsigned long k; /* min length to left of y */
83 unsigned long m; /* min length */
84 unsigned long n; /* max length */
85 unsigned short a; /* number of alternations */
86 unsigned short b; /* number of backrefs */
87 unsigned short c; /* number of closures */
88 unsigned short e; /* $ */
89 unsigned short i; /* number of negations */
90 unsigned short p; /* number of named subexpressions */
91 unsigned short s; /* number of simple closures */
92 unsigned short t; /* number of tries */
93 unsigned short u; /* number of unnamed subexpressions */
94 Rex_t* x; /* max length REX_STRING */
95 Rex_t* y; /* max length REX_TRIE */
96 } Stats_t;
97
98 typedef struct Token_s
99 {
100 unsigned long min;
101 unsigned long max;
102 short lex;
103 short len;
104 short esc;
105 short att;
106 short push;
107 } Token_t;
108
109 typedef struct Cenv_s
110 {
111 int delimiter; /* pattern delimiter */
112 int error; /* last error */
113 int explicit; /* explicit match on this char */
114 int mappeddot; /* inverse mapped '.' */
115 int mappednewline; /* inverse mapped '\n' */
116 int mappedslash; /* inverse mapped '/' */
117 regflags_t flags; /* flags arg to regcomp */
118 int type; /* BRE,ERE,ARE,SRE,KRE */
119 unsigned char* cursor; /* curent point in re */
120 unsigned char* pattern; /* the original pattern */
121 unsigned char* literal; /* literal restart pattern */
122 int parno; /* number of last open paren */
123 int parnest; /* paren nest count */
124 int posixkludge; /* to make * nonspecial */
125 int regexp; /* <regexp.h> compatibility */
126 Token_t token; /* token lookahead */
127 Stats_t stats; /* RE statistics */
128 int terminator; /* pattern terminator */
129 Rex_t* paren[2*(BACK_REF_MAX+2)];
130 /* paren[i]!=0 if \i defined */
131 regex_t* regex; /* user handle */
132 regdisc_t* disc; /* user discipline */
133 unsigned char* map; /* external to native ccode map */
134 unsigned char* MAP; /* fold and/or map */
135 } Cenv_t;
136
137 /*
138 * allocate a new Rex_t node
139 */
140
141 static Rex_t*
node(Cenv_t * env,int type,int lo,int hi,size_t extra)142 node(Cenv_t* env, int type, int lo, int hi, size_t extra)
143 {
144 register Rex_t* e;
145
146 if (e = (Rex_t*)alloc(env->disc, 0, sizeof(Rex_t) + extra))
147 {
148 memset(e, 0, sizeof(Rex_t) + extra);
149 e->type = type;
150 e->marked = 0;
151 e->lo = lo;
152 e->hi = hi;
153 e->flags = env->flags;
154 e->map = (e->flags & REG_ICASE) ? env->MAP : env->map;
155 e->explicit = env->explicit;
156 if (extra)
157 e->re.data = (char*)e + sizeof(Rex_t);
158 }
159 return e;
160 }
161
162 /*
163 * free a Trie_node_t node
164 */
165
166 static void
triedrop(regdisc_t * disc,Trie_node_t * e)167 triedrop(regdisc_t* disc, Trie_node_t* e)
168 {
169 if (e)
170 {
171 triedrop(disc, e->son);
172 triedrop(disc, e->sib);
173 alloc(disc, e, 0);
174 }
175 }
176
177 /*
178 * free a Rex_t node
179 */
180
181 void
drop(regdisc_t * disc,Rex_t * e)182 drop(regdisc_t* disc, Rex_t* e)
183 {
184 int i;
185 Rex_t* x;
186
187 if (e && !(disc->re_flags & REG_NOFREE))
188 do
189 {
190 switch (e->type)
191 {
192 case REX_ALT:
193 case REX_CONJ:
194 drop(disc, e->re.group.expr.binary.left);
195 drop(disc, e->re.group.expr.binary.right);
196 break;
197 case REX_GROUP:
198 case REX_GROUP_AHEAD:
199 case REX_GROUP_AHEAD_NOT:
200 case REX_GROUP_BEHIND:
201 case REX_GROUP_BEHIND_NOT:
202 case REX_GROUP_CUT:
203 case REX_NEG:
204 case REX_REP:
205 drop(disc, e->re.group.expr.rex);
206 break;
207 case REX_TRIE:
208 for (i = 0; i <= UCHAR_MAX; i++)
209 triedrop(disc, e->re.trie.root[i]);
210 break;
211 }
212 x = e->next;
213 alloc(disc, e, 0);
214 } while (e = x);
215 }
216
217 /*
218 * mark e and descendants minimal
219 */
220
221 static void
mark(register Rex_t * e,int set)222 mark(register Rex_t* e, int set)
223 {
224 if (e && !e->marked)
225 do
226 {
227 e->marked = 1;
228 if (set)
229 e->flags |= REG_MINIMAL;
230 else
231 e->flags &= ~REG_MINIMAL;
232 switch (e->type)
233 {
234 case REX_ALT:
235 case REX_CONJ:
236 case REX_GROUP_COND:
237 if (e->re.group.expr.binary.left)
238 mark(e->re.group.expr.binary.left, set);
239 if (e->re.group.expr.binary.right)
240 mark(e->re.group.expr.binary.right, set);
241 break;
242 case REX_GROUP:
243 case REX_GROUP_AHEAD:
244 case REX_GROUP_AHEAD_NOT:
245 case REX_GROUP_BEHIND:
246 case REX_GROUP_BEHIND_NOT:
247 case REX_GROUP_CUT:
248 case REX_NEG:
249 case REX_REP:
250 case REX_TRIE:
251 mark(e->re.group.expr.rex, set);
252 break;
253 }
254 } while (e = e->next);
255 }
256
257 /*
258 * assign subexpression numbers by a preorder tree walk
259 */
260
261 static int
serialize(Cenv_t * env,Rex_t * e,int n)262 serialize(Cenv_t* env, Rex_t* e, int n)
263 {
264 do
265 {
266 e->serial = n++;
267 switch (e->type)
268 {
269 case REX_ALT:
270 case REX_GROUP_COND:
271 if (e->re.group.expr.binary.left)
272 n = serialize(env, e->re.group.expr.binary.left, n);
273 e->re.group.expr.binary.serial = n++;
274 if (e->re.group.expr.binary.right)
275 n = serialize(env, e->re.group.expr.binary.right, n);
276 break;
277 case REX_CONJ:
278 n = serialize(env, e->re.group.expr.binary.left, n);
279 n = serialize(env, e->re.group.expr.binary.right, n);
280 break;
281 case REX_GROUP:
282 case REX_GROUP_AHEAD:
283 case REX_GROUP_AHEAD_NOT:
284 case REX_GROUP_BEHIND:
285 case REX_GROUP_BEHIND_NOT:
286 case REX_GROUP_CUT:
287 case REX_NEG:
288 case REX_REP:
289 n = serialize(env, e->re.group.expr.rex, n);
290 break;
291 }
292 } while (e = e->next);
293 return n;
294 }
295
296 /*
297 * catenate e and f into a sequence, collapsing them if possible
298 */
299
300 static Rex_t*
cat(Cenv_t * env,Rex_t * e,Rex_t * f)301 cat(Cenv_t* env, Rex_t* e, Rex_t* f)
302 {
303 Rex_t* g;
304
305 if (!f)
306 {
307 drop(env->disc, e);
308 return 0;
309 }
310 if (e->type == REX_NULL)
311 {
312 drop(env->disc, e);
313 return f;
314 }
315 if (f->type == REX_NULL)
316 {
317 g = f->next;
318 f->next = 0;
319 drop(env->disc, f);
320 f = g;
321 }
322 else if (e->type == REX_DOT && f->type == REX_DOT)
323 {
324 unsigned int m = e->lo + f->lo;
325 unsigned int n = e->hi + f->hi;
326
327 if (m <= RE_DUP_MAX)
328 {
329 if (e->hi > RE_DUP_MAX || f->hi > RE_DUP_MAX)
330 {
331 n = RE_DUP_INF;
332 goto combine;
333 }
334 else if (n <= RE_DUP_MAX)
335 {
336 combine:
337 e->lo = m;
338 e->hi = n;
339 g = f->next;
340 f->next = 0;
341 drop(env->disc, f);
342 f = g;
343 }
344 }
345 }
346 e->next = f;
347 return e;
348 }
349
350 /*
351 * collect re statistics
352 */
353
354 static int
stats(register Cenv_t * env,register Rex_t * e)355 stats(register Cenv_t* env, register Rex_t* e)
356 {
357 register unsigned long n;
358 register unsigned long m;
359 unsigned long cm;
360 unsigned long nm;
361 unsigned long cn;
362 unsigned long nn;
363 unsigned long l;
364 unsigned long k;
365 unsigned long t;
366 Rex_t* q;
367 Rex_t* x;
368 Rex_t* y;
369 unsigned char c;
370 unsigned char b;
371
372 do
373 {
374 switch (e->type)
375 {
376 case REX_ALT:
377 x = env->stats.x;
378 l = env->stats.l;
379 y = env->stats.y;
380 k = env->stats.k;
381 t = env->stats.t;
382 if (++env->stats.a <= 0)
383 return 1;
384 cm = env->stats.m;
385 env->stats.m = 0;
386 cn = env->stats.n;
387 env->stats.n = 0;
388 if (stats(env, e->re.group.expr.binary.left))
389 return 1;
390 m = env->stats.m;
391 env->stats.m = 0;
392 n = env->stats.n;
393 env->stats.n = 0;
394 if (e->re.group.expr.binary.right && stats(env, e->re.group.expr.binary.right))
395 return 1;
396 if (env->stats.m > m)
397 env->stats.m = m;
398 else
399 m = env->stats.m;
400 if ((env->stats.m += cm) < m)
401 return 1;
402 if (env->stats.n < n)
403 env->stats.n = n;
404 else
405 n = env->stats.n;
406 if ((env->stats.n += cn) < n)
407 return 1;
408 env->stats.x = x;
409 env->stats.l = l;
410 env->stats.y = y;
411 env->stats.k = k;
412 env->stats.t = t;
413 break;
414 case REX_BACK:
415 if (++env->stats.b <= 0)
416 return 1;
417 break;
418 case REX_CLASS:
419 case REX_COLL_CLASS:
420 case REX_DOT:
421 case REX_ONECHAR:
422 n = env->stats.m;
423 if ((env->stats.m += e->lo) < n)
424 return 1;
425 if (e->hi != RE_DUP_INF)
426 {
427 n = env->stats.n;
428 if ((env->stats.n += e->hi) < n)
429 return 1;
430 }
431 if (e->lo != e->hi)
432 {
433 if (++env->stats.c <= 0)
434 return 1;
435 if (++env->stats.s <= 0)
436 return 1;
437 }
438 break;
439 case REX_CONJ:
440 cm = env->stats.m;
441 env->stats.m = 0;
442 cn = env->stats.n;
443 env->stats.n = 0;
444 if (stats(env, e->re.group.expr.binary.left))
445 return 1;
446 nm = env->stats.m;
447 env->stats.m = 0;
448 nn = env->stats.n;
449 env->stats.n = 0;
450 if (stats(env, e->re.group.expr.binary.right))
451 return 1;
452 if (env->stats.m < nm)
453 env->stats.m = nm;
454 else
455 nm = env->stats.m;
456 if ((env->stats.m += cm) < nm)
457 return 1;
458 if (env->stats.n < nn)
459 env->stats.n = nn;
460 else
461 nn = env->stats.n;
462 if ((env->stats.n += cn) < nn)
463 return 1;
464 break;
465 case REX_END:
466 env->stats.e = 1;
467 break;
468 case REX_GROUP:
469 if (e->re.group.number && ++env->stats.p <= 0 || !e->re.group.number && ++env->stats.u <= 0)
470 return 1;
471 if (stats(env, e->re.group.expr.rex))
472 return 1;
473 break;
474 case REX_GROUP_AHEAD:
475 case REX_GROUP_AHEAD_NOT:
476 case REX_GROUP_BEHIND:
477 case REX_GROUP_BEHIND_NOT:
478 m = env->stats.m;
479 n = env->stats.n;
480 x = env->stats.x;
481 y = env->stats.y;
482 if (stats(env, e->re.group.expr.rex))
483 return 1;
484 env->stats.m = m;
485 env->stats.n = n;
486 env->stats.x = x;
487 env->stats.y = y;
488 switch (e->type)
489 {
490 case REX_GROUP_AHEAD:
491 case REX_GROUP_BEHIND:
492 if (++env->stats.u <= 0)
493 return 1;
494 break;
495 }
496 break;
497 case REX_GROUP_COND:
498 if (++env->stats.u <= 0)
499 return 1;
500 m = env->stats.m;
501 n = env->stats.n;
502 x = env->stats.x;
503 y = env->stats.y;
504 if (e->re.group.size > 0 && ++env->stats.b <= 0)
505 return 1;
506 if (e->re.group.expr.binary.left && stats(env, e->re.group.expr.binary.left))
507 return 1;
508 if (q = e->re.group.expr.binary.right)
509 {
510 if (q->re.group.expr.binary.left && stats(env, q->re.group.expr.binary.left))
511 return 1;
512 if (q->re.group.expr.binary.right && stats(env, q->re.group.expr.binary.right))
513 return 1;
514 }
515 env->stats.m = m;
516 env->stats.n = n;
517 env->stats.x = x;
518 env->stats.y = y;
519 break;
520 case REX_GROUP_CUT:
521 if (++env->stats.u <= 0)
522 return 1;
523 m = env->stats.m;
524 n = env->stats.n;
525 x = env->stats.x;
526 y = env->stats.y;
527 if (stats(env, e->re.group.expr.rex))
528 return 1;
529 env->stats.m = m;
530 env->stats.n = n;
531 env->stats.x = x;
532 env->stats.y = y;
533 break;
534 case REX_NEG:
535 env->stats.i++;
536 x = env->stats.x;
537 l = env->stats.l;
538 y = env->stats.y;
539 k = env->stats.k;
540 t = env->stats.t;
541 cm = env->stats.m;
542 env->stats.m = 0;
543 if (stats(env, e->re.group.expr.rex))
544 return 1;
545 env->stats.m = !env->stats.m;
546 if ((env->stats.m += cm) < cm)
547 return 1;
548 env->stats.x = x;
549 env->stats.l = l;
550 env->stats.y = y;
551 env->stats.k = k;
552 env->stats.t = t;
553 break;
554 case REX_REP:
555 x = env->stats.x;
556 l = env->stats.l;
557 y = env->stats.y;
558 k = env->stats.k;
559 t = env->stats.t;
560 if (++env->stats.c <= 0)
561 return 1;
562 b = env->stats.b;
563 c = env->stats.c;
564 cm = env->stats.m;
565 env->stats.m = 0;
566 if (stats(env, e->re.group.expr.rex))
567 return 1;
568 if (env->stats.m == 1 && b == env->stats.b && c == env->stats.c && ++env->stats.s <= 0)
569 return 1;
570 if (e->lo < 1)
571 {
572 env->stats.x = x;
573 env->stats.l = l;
574 env->stats.y = y;
575 env->stats.k = k;
576 env->stats.t = t;
577 env->stats.m = cm;
578 }
579 else
580 {
581 m = env->stats.m;
582 if ((env->stats.m *= e->lo) > 0 && env->stats.m < m)
583 return 1;
584 m = env->stats.m;
585 if ((env->stats.m += cm) < m)
586 return 1;
587 if (env->stats.x != x)
588 env->stats.l = cm;
589 if (env->stats.y != y)
590 env->stats.k = cm;
591 }
592 break;
593 case REX_STRING:
594 if (!e->map)
595 {
596 cm = env->stats.m;
597 if ((env->stats.m += e->re.string.size) < cm)
598 return 1;
599 cn = env->stats.n;
600 if ((env->stats.n += e->re.string.size) < cn)
601 return 1;
602 if (!env->stats.x || env->stats.x->re.string.size < e->re.string.size)
603 {
604 env->stats.x = e;
605 env->stats.l = cm;
606 }
607 }
608 break;
609 case REX_TRIE:
610 if (++env->stats.s <= 0)
611 return 1;
612 cm = env->stats.m;
613 if ((env->stats.m += e->re.trie.min) < cm)
614 return 1;
615 cn = env->stats.n;
616 if ((env->stats.n += e->re.trie.max) < cn)
617 return 1;
618 env->stats.t++;
619 if (!env->stats.y || env->stats.y->re.trie.min < e->re.trie.min)
620 {
621 env->stats.y = e;
622 env->stats.k = cm;
623 }
624 break;
625 }
626 } while (e = e->next);
627 return 0;
628 }
629
630 static int token(Cenv_t*);
631
632 static int
magic(register Cenv_t * env,register int c,int escaped)633 magic(register Cenv_t* env, register int c, int escaped)
634 {
635 register char* sp;
636 register int n;
637 int o = c;
638 int e = env->error;
639 int l = env->token.len;
640 short* mp;
641 char* ep;
642
643 if (mp = state.magic[c])
644 {
645 c = mp[env->type+escaped];
646 if (c >= T_META)
647 {
648 sp = (char*)env->cursor + env->token.len;
649 switch (c)
650 {
651 case T_LEFT:
652 n = 0;
653 ep = sp;
654 while (*sp >= '0' && *sp <= '9')
655 {
656 if (n > (INT_MAX / 10))
657 {
658 env->error = REG_BADBR;
659 goto bad;
660 }
661 n = n * 10 + *sp++ - '0';
662 }
663 if (sp == ep)
664 {
665 if (env->type < SRE || *sp != ',')
666 {
667 env->error = *sp ? REG_BADBR : REG_EBRACE;
668 goto bad;
669 }
670 }
671 else if (n > RE_DUP_MAX)
672 {
673 env->error = REG_BADBR;
674 goto bad;
675 }
676 env->token.min = n;
677 if (*sp == ',')
678 {
679 n = 0;
680 ep = ++sp;
681 while (*sp >= '0' && *sp <= '9')
682 {
683 if (n > (INT_MAX / 10))
684 {
685 env->error = REG_BADBR;
686 goto bad;
687 }
688 n = n * 10 + *sp++ - '0';
689 }
690 if (sp == ep)
691 n = RE_DUP_INF;
692 else if (n < env->token.min)
693 {
694 env->error = REG_BADBR;
695 goto bad;
696 }
697 }
698 env->token.max = n;
699 switch (*sp)
700 {
701 case 0:
702 env->error = REG_EBRACE;
703 goto bad;
704 case '\\':
705 if (!escaped)
706 {
707 env->error = REG_BADBR;
708 goto bad;
709 }
710 sp++;
711 break;
712 default:
713 if (escaped)
714 {
715 env->error = REG_BADBR;
716 goto bad;
717 }
718 break;
719 }
720 switch (*sp++)
721 {
722 case 0:
723 env->error = REG_EBRACE;
724 goto bad;
725 case '}':
726 break;
727 default:
728 env->error = REG_BADBR;
729 goto bad;
730 }
731 env->token.len = sp - (char*)env->cursor;
732 break;
733 case T_RIGHT:
734 env->error = REG_EBRACE;
735 goto bad;
736 case T_OPEN:
737 if (env->type < SRE && *sp == '?')
738 {
739 env->token.len++;
740 env->token.lex = 0;
741 goto group;
742 }
743 break;
744 case T_ESCAPE:
745 c = chresc(sp - 2, &ep);
746 if (ep < sp)
747 goto bad;
748 env->token.len += ep - sp;
749 if (c >= T_META)
750 {
751 env->token.lex = c;
752 c = C_ESC;
753 }
754 return c;
755 case T_BACK+0:
756 case T_BACK+1:
757 case T_BACK+2:
758 case T_BACK+3:
759 case T_BACK+4:
760 case T_BACK+5:
761 case T_BACK+6:
762 case T_BACK+7:
763 n = chresc(sp - 2, &ep);
764 if (ep > sp + 1)
765 {
766 env->token.len += ep - sp;
767 return n;
768 }
769 /*FALLTHROUGH*/
770 case T_BACK+8:
771 case T_BACK+9:
772 if (env->type == SRE || c == T_BACK && !(env->flags & REG_LENIENT))
773 {
774 env->error = REG_BADESC;
775 goto bad;
776 }
777 if ((env->flags & REG_MULTIREF) && isdigit(*sp))
778 {
779 c = (c - T_BACK) * 10 + (*sp - '0');
780 if (c > 0 && c <= env->parno && env->paren[c])
781 c += T_BACK;
782 else
783 c = chresc(sp - 2, &ep);
784 env->token.len++;
785 }
786 if (c == T_BACK)
787 c = 0;
788 break;
789 case T_BAD:
790 if (escaped == 1 && (env->flags & REG_LENIENT) && (c = mp[env->type+escaped+2]) >= T_META)
791 return c;
792 goto bad;
793 }
794 if (env->type >= SRE)
795 {
796 if (c == T_DOT)
797 c = '.';
798 else if (c < T_OPEN)
799 {
800 if (env->type == KRE && *(env->cursor + env->token.len) == '-' && *(env->cursor + env->token.len + 1) == '(')
801 {
802 env->token.len++;
803 env->token.att = 1;
804 }
805 if (env->type == KRE && *(env->cursor + env->token.len) == '(')
806 {
807 env->token.len++;
808 switch (c)
809 {
810 case T_AT:
811 break;
812 case T_PERCENT:
813 env->token.lex = c;
814 goto group;
815 case T_TILDE:
816 env->token.lex = 0;
817 goto group;
818 default:
819 env->token.lex = c;
820 break;
821 }
822 c = T_OPEN;
823 }
824 else if (c == T_STAR)
825 c = T_DOTSTAR;
826 else if (c == T_QUES)
827 c = T_DOT;
828 else
829 {
830 c = o;
831 env->token.len = l;
832 }
833 }
834 else if (c > T_BACK)
835 {
836 c = (c - T_BACK) * 2 - 1;
837 c = (c > env->parno || !env->paren[c]) ? o : T_BACK + c;
838 }
839 else if (env->type == KRE && !env->parnest && (env->flags & REG_SHELL_GROUP))
840 {
841 if (c == T_AND)
842 c = '&';
843 else if (c == T_BAR)
844 c = '|';
845 else if (c == T_OPEN)
846 c = '(';
847 }
848 }
849 }
850 }
851 else if (escaped == 2)
852 {
853 if (env->type >= SRE && !(env->flags & REG_SHELL_ESCAPED) || (env->flags & REG_ESCAPE) && (c == '[' || c == '-' || c == ']' || env->delimiter && c == env->delimiter))
854 /*ok*/;
855 else
856 {
857 env->error = REG_BADESC;
858 goto bad;
859 }
860 }
861 else if (escaped && !(env->flags & REG_LENIENT) && c != ']')
862 {
863 env->error = REG_BADESC;
864 goto bad;
865 }
866 return c;
867 group:
868 sp = (char*)env->cursor + env->token.len;
869 switch (*sp++)
870 {
871 case ')':
872 break;
873 case '#':
874 for (;;)
875 {
876 switch (*sp++)
877 {
878 case 0:
879 env->error = REG_EPAREN;
880 return T_BAD;
881 case ')':
882 break;
883 default:
884 continue;
885 }
886 break;
887 }
888 break;
889 default:
890 return T_GROUP;
891 }
892 env->cursor = (unsigned char*)sp;
893 return token(env);
894 bad:
895 if (escaped == 2)
896 env->error = e;
897 else if (env->flags & REG_LENIENT)
898 return o;
899 else if (escaped == 1 && !env->error)
900 {
901 if (mp || o == ']')
902 return o;
903 env->error = REG_BADESC;
904 }
905 return T_BAD;
906 }
907
908 static int
token(register Cenv_t * env)909 token(register Cenv_t* env)
910 {
911 int c;
912 int posixkludge;
913
914 if (env->token.push)
915 return env->token.lex;
916 env->token.att = env->token.esc = 0;
917 if ((env->token.len = MBSIZE(env->cursor)) > 1)
918 return env->token.lex = C_MB;
919 env->token.lex = 0;
920 for (;;)
921 {
922 c = *env->cursor;
923 if (c == 0 || c == env->delimiter || c == env->terminator)
924 return T_END;
925 if (!(env->flags & REG_COMMENT))
926 break;
927 if (c == '#')
928 {
929 do
930 {
931 c = *++env->cursor;
932 if (c == 0 || c == env->delimiter)
933 return T_END;
934 } while (c != '\n');
935 }
936 else if (!isspace(c))
937 break;
938 env->cursor++;
939 }
940 if (c == '\n' && (env->flags & REG_MULTIPLE) && !env->delimiter)
941 {
942 if (env->parnest)
943 {
944 env->error = REG_EPAREN;
945 return T_BAD;
946 }
947 env->parno = 0;
948 env->pattern = env->cursor + 1;
949 return T_BAR;
950 }
951 if (env->flags & REG_LITERAL)
952 return c;
953 if (posixkludge = env->posixkludge)
954 {
955 env->posixkludge = 0;
956 if (c == '*')
957 return c;
958 }
959 if (c == '\\')
960 {
961 if (env->flags & REG_SHELL_ESCAPED)
962 return c;
963 if (!(c = *(env->cursor + 1)) || c == env->terminator)
964 {
965 if (env->flags & REG_LENIENT)
966 {
967 if (c)
968 {
969 env->token.esc = env->token.len;
970 env->token.len += MBSIZE(env->cursor + 1);
971 return c;
972 }
973 return '\\';
974 }
975 env->error = REG_EESCAPE;
976 return T_BAD;
977 }
978 env->token.esc = env->token.len;
979 env->token.len += MBSIZE(env->cursor + 1);
980 if (env->delimiter && c == 'n')
981 return '\n';
982 else if (c == env->delimiter)
983 return magic(env, c, 0);
984 else if (c == '(' && env->type == BRE)
985 env->posixkludge = 1;
986 else if (c == ')' && env->type == BRE && env->parnest <= 0)
987 {
988 env->error = REG_EPAREN;
989 return T_BAD;
990 }
991 else if (isspace(c) && (env->flags & REG_COMMENT))
992 return c;
993 return magic(env, c, 1);
994 }
995 else if (c == '$')
996 {
997 if (env->type == BRE && (*(env->cursor + 1) == 0 || *(env->cursor + 1) == env->delimiter || *(env->cursor + 1) == env->terminator || *(env->cursor + 1) == '\\' && *(env->cursor + 2) == ')') || (env->flags & REG_MULTIPLE) && *(env->cursor + 1) == '\n')
998 return T_DOLL;
999 }
1000 else if (c == '^')
1001 {
1002 if (env->type == BRE && (env->cursor == env->pattern || posixkludge == 1))
1003 {
1004 env->posixkludge = 2;
1005 return T_CFLX;
1006 }
1007 }
1008 else if (c == ')')
1009 {
1010 if (env->type != BRE && env->parnest <= 0)
1011 return c;
1012 }
1013 else if (c == '/' && env->explicit == env->mappedslash)
1014 {
1015 while (*(env->cursor + env->token.len) == c)
1016 env->token.len++;
1017 return T_SLASHPLUS;
1018 }
1019 return magic(env, c, 0);
1020 }
1021
1022 static Celt_t*
col(Celt_t * ce,int ic,unsigned char * bp,int bw,int bc,unsigned char * ep,int ew,int ec)1023 col(Celt_t* ce, int ic, unsigned char* bp, int bw, int bc, unsigned char* ep, int ew, int ec)
1024 {
1025 register char* s;
1026 register unsigned char* k;
1027 register unsigned char* e;
1028 register int c;
1029 register int cc;
1030 int bt;
1031 int et;
1032 Ckey_t key;
1033
1034 cc = 0;
1035 for (;;)
1036 {
1037 k = key;
1038 if (bw == 1)
1039 {
1040 c = bc;
1041 if (ic)
1042 {
1043 if (isupper(c))
1044 {
1045 c = tolower(c);
1046 cc = -1;
1047 }
1048 else if (islower(c))
1049 {
1050 c = toupper(c);
1051 cc = -1;
1052 }
1053 }
1054 *k++ = c;
1055 }
1056 else if (bw < COLL_KEY_MAX)
1057 {
1058 s = (char*)bp;
1059 if (ic)
1060 {
1061 c = mbchar(s);
1062 if (iswupper(c))
1063 {
1064 c = towlower(c);
1065 cc = 1;
1066 }
1067 else if (iswlower(c))
1068 {
1069 c = towupper(c);
1070 cc = 1;
1071 }
1072 }
1073 if (cc > 0)
1074 {
1075 cc = -1;
1076 k += mbconv((char*)k, c);
1077 }
1078 else
1079 for (e = k + bw; k < e; *k++ = *s++);
1080 }
1081 *k = 0;
1082 mbxfrm(ce->beg, key, COLL_KEY_MAX);
1083 if (ep)
1084 {
1085 k = key;
1086 c = mbchar(k);
1087 if (iswupper(c))
1088 bt = COLL_range_uc;
1089 else if (iswlower(c))
1090 bt = COLL_range_lc;
1091 else
1092 bt = COLL_range;
1093 k = key;
1094 if (ew == 1)
1095 {
1096 c = ec;
1097 if (ic)
1098 {
1099 if (isupper(c))
1100 {
1101 c = tolower(c);
1102 cc = -1;
1103 }
1104 else if (islower(c))
1105 {
1106 c = toupper(c);
1107 cc = -1;
1108 }
1109 }
1110 *k++ = c;
1111 }
1112 else if (ew < COLL_KEY_MAX)
1113 {
1114 s = (char*)ep;
1115 if (ic)
1116 {
1117 c = mbchar(s);
1118 if (iswupper(c))
1119 {
1120 c = towlower(c);
1121 cc = 1;
1122 }
1123 else if (iswlower(c))
1124 {
1125 c = towupper(c);
1126 cc = 1;
1127 }
1128 }
1129 if (cc > 0)
1130 {
1131 cc = -1;
1132 k += mbconv((char*)k, c);
1133 }
1134 else
1135 for (e = k + ew; k < e; *k++ = *s++);
1136 }
1137 *k = 0;
1138 mbxfrm(ce->end, key, COLL_KEY_MAX);
1139 k = key;
1140 c = mbchar(k);
1141 if (iswupper(c))
1142 et = COLL_range_uc;
1143 else if (iswlower(c))
1144 et = COLL_range_lc;
1145 else
1146 et = COLL_range;
1147 ce->typ = bt == et ? bt : COLL_range;
1148 }
1149 else
1150 ce->typ = COLL_char;
1151 ce++;
1152 if (!ic || !cc)
1153 break;
1154 ic = 0;
1155 }
1156 return ce;
1157 }
1158
1159 static Rex_t*
bra(Cenv_t * env)1160 bra(Cenv_t* env)
1161 {
1162 Rex_t* e;
1163 int c;
1164 int i;
1165 int w;
1166 int neg;
1167 int last;
1168 int inrange;
1169 int complicated;
1170 int collate;
1171 int elements;
1172 unsigned char* first;
1173 unsigned char* start;
1174 unsigned char* begin;
1175 unsigned char* s;
1176 regclass_t f;
1177 unsigned char buf[4 * (COLL_KEY_MAX + 1)];
1178 #if _PACKAGE_ast
1179 int ic;
1180 char mbc[COLL_KEY_MAX + 1];
1181 #endif
1182
1183 if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t))))
1184 return 0;
1185 collate = complicated = elements = 0;
1186 if (*env->cursor == '^' || env->type >= SRE && *env->cursor == '!')
1187 {
1188 env->cursor++;
1189 neg = 1;
1190 }
1191 else
1192 neg = 0;
1193 first = env->cursor;
1194 start = first + MBSIZE(first);
1195 if (*env->cursor == 0 || *(env->cursor + 1) == 0 || *env->cursor == env->terminator || *(env->cursor + 1) == env->terminator || (env->flags & REG_ESCAPE) && (*env->cursor == env->delimiter || *env->cursor != '\\' && *(env->cursor + 1) == env->delimiter))
1196 goto error;
1197 begin = env->cursor + MBSIZE(env->cursor);
1198
1199 /*
1200 * inrange: 0=no, 1=possibly, 2=definitely
1201 */
1202
1203 inrange = 0;
1204 for (;;)
1205 {
1206 if (!(c = *env->cursor) || c == env->terminator || c == env->delimiter && (env->flags & REG_ESCAPE))
1207 goto error;
1208 env->cursor += (w = MBSIZE(env->cursor));
1209 if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || env->type >= SRE && env->parnest || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)))
1210 {
1211 if (*env->cursor)
1212 {
1213 if (*env->cursor == 'n')
1214 {
1215 env->cursor++;
1216 c = '\n';
1217 }
1218 else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED))
1219 {
1220 env->token.len = 1;
1221 w = magic(env, *env->cursor, 2);
1222 if (env->token.len > 1 || w != T_BAD)
1223 {
1224 if (env->token.len == 1 && (f = classfun(w)))
1225 {
1226 if (inrange > 1)
1227 {
1228 if (env->type < SRE && !(env->flags & REG_LENIENT))
1229 goto erange;
1230 inrange = 0;
1231 }
1232 env->cursor++;
1233 for (c = 0; c <= UCHAR_MAX; c++)
1234 if ((*f)(c))
1235 setadd(e->re.charclass, c);
1236 complicated++;
1237 elements++;
1238 continue;
1239 }
1240 if (env->token.len > 1 || w >= 0 && w < T_META)
1241 {
1242 c = w;
1243 if (c > UCHAR_MAX)
1244 {
1245 if (env->type < SRE && !(env->flags & REG_LENIENT) && !mbwide())
1246 goto erange;
1247 c = UCHAR_MAX;
1248 }
1249 env->cursor += env->token.len;
1250 }
1251 }
1252 }
1253 }
1254 }
1255 else if (c == ']')
1256 {
1257 if (env->cursor == begin)
1258 {
1259 last = c;
1260 inrange = 1;
1261 continue;
1262 }
1263 if (inrange != 0)
1264 {
1265 setadd(e->re.charclass, last);
1266 elements++;
1267 if (inrange == 2)
1268 {
1269 setadd(e->re.charclass, '-');
1270 elements++;
1271 }
1272 }
1273 break;
1274 }
1275 else if (c == '-')
1276 {
1277 if (!inrange && env->cursor != begin && *env->cursor != ']')
1278 {
1279 if (env->type < SRE && !(env->flags & REG_LENIENT))
1280 goto erange;
1281 continue;
1282 }
1283 else if (inrange == 1)
1284 {
1285 inrange = 2;
1286 complicated++;
1287 continue;
1288 }
1289 }
1290 else if (c == '[')
1291 {
1292 switch (*env->cursor)
1293 {
1294 case 0:
1295 goto error;
1296 case ':':
1297 if (env->regexp)
1298 goto normal;
1299 if (inrange == 1)
1300 {
1301 setadd(e->re.charclass, last);
1302 elements++;
1303 }
1304 if (!(f = regclass((char*)env->cursor, (char**)&env->cursor)))
1305 {
1306 if (env->cursor == start && (c = *(env->cursor + 1)))
1307 {
1308 s = start = env->cursor + 1;
1309 while (*++s && *s != ':');
1310 if (*s == ':' && *(s + 1) == ']' && *(s + 2) == ']')
1311 {
1312 if ((i = (s - start)) == 1)
1313 {
1314 switch (c)
1315 {
1316 case '<':
1317 i = REX_WBEG;
1318 break;
1319 case '>':
1320 i = REX_WEND;
1321 break;
1322 default:
1323 i = 0;
1324 break;
1325 }
1326 if (i)
1327 {
1328 env->cursor = s + 3;
1329 drop(env->disc, e);
1330 return node(env, i, 0, 0, 0);
1331 }
1332 }
1333 }
1334 }
1335 env->error = REG_ECTYPE;
1336 goto error;
1337 }
1338 for (c = 0; c <= UCHAR_MAX; c++)
1339 if ((*f)(c))
1340 setadd(e->re.charclass, c);
1341 inrange = 0;
1342 complicated++;
1343 elements++;
1344 continue;
1345 case '=':
1346 if (env->regexp)
1347 goto normal;
1348 if (inrange == 2)
1349 goto erange;
1350 if (inrange == 1)
1351 {
1352 setadd(e->re.charclass, last);
1353 elements++;
1354 }
1355 if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0)
1356 goto ecollate;
1357 if (c > 1)
1358 collate++;
1359 else
1360 setadd(e->re.charclass, buf[0]);
1361 c = buf[0];
1362 inrange = 0;
1363 complicated++;
1364 elements++;
1365 continue;
1366 case '.':
1367 if (env->regexp)
1368 goto normal;
1369 if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0)
1370 goto ecollate;
1371 if (c > 1)
1372 collate++;
1373 c = buf[0];
1374 complicated++;
1375 break;
1376 default:
1377 normal:
1378 if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))
1379 goto error;
1380 break;
1381 }
1382 }
1383 else if (w > 1)
1384 complicated++;
1385 if (inrange == 2)
1386 {
1387 if (last <= c)
1388 {
1389 for (i = last; i <= c; i++)
1390 setadd(e->re.charclass, i);
1391 inrange = env->type >= SRE || (env->flags & REG_LENIENT);
1392 elements += 2;
1393 }
1394 else if (env->type >= SRE)
1395 {
1396 setadd(e->re.charclass, last);
1397 setadd(e->re.charclass, c);
1398 elements += 2;
1399 inrange = 1;
1400 }
1401 else if (!(env->flags & REG_LENIENT))
1402 goto erange;
1403 else
1404 inrange = 0;
1405 }
1406 else if (inrange == 1)
1407 {
1408 setadd(e->re.charclass, last);
1409 elements++;
1410 }
1411 else
1412 inrange = 1;
1413 last = c;
1414 }
1415 #if _PACKAGE_ast
1416 if (complicated && mbcoll())
1417 {
1418 Dt_t* dt;
1419 Cchr_t* cc;
1420 Cchr_t* tc;
1421 Cchr_t* xc;
1422 Celt_t* ce;
1423 Cchr_t key;
1424 int rw;
1425 int rc;
1426 int wc;
1427 unsigned char* rp;
1428 unsigned char* pp;
1429 char* wp;
1430 char cb[2][COLL_KEY_MAX+1];
1431
1432 static Dtdisc_t disc;
1433
1434 static const char primary[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
1435
1436 if (!(dt = (Dt_t*)LCINFO(AST_LC_COLLATE)->data))
1437 {
1438 disc.key = offsetof(Cchr_t, key);
1439 if ((cc = newof(0, Cchr_t, elementsof(primary), 0)) && (dt = dtopen(&disc, Dttree)))
1440 {
1441 for (i = 0; i < elementsof(primary) - 1; i++, cc++)
1442 {
1443 cc->nam[0] = primary[i];
1444 mbxfrm(cc->key, cc->nam, COLL_KEY_MAX);
1445 dtinsert(dt, cc);
1446 }
1447 for (i = 0; i < elementsof(cc->key); i++)
1448 cc->key[i] = ~0;
1449 dtinsert(dt, cc);
1450 LCINFO(AST_LC_COLLATE)->data = (void*)dt;
1451 }
1452 else
1453 {
1454 if (cc)
1455 free(cc);
1456 drop(env->disc, e);
1457 return 0;
1458 }
1459 }
1460 if (dt)
1461 {
1462 drop(env->disc, e);
1463 if (ic = env->flags & REG_ICASE)
1464 elements *= 2;
1465 if (!(e = node(env, REX_COLL_CLASS, 1, 1, (elements + 2) * sizeof(Celt_t))))
1466 return 0;
1467 ce = (Celt_t*)e->re.data;
1468 e->re.collate.invert = neg;
1469 e->re.collate.elements = ce;
1470 env->cursor = first;
1471 inrange = 0;
1472 for (;;)
1473 {
1474 if ((c = *env->cursor) == 0 || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter)
1475 goto error;
1476 pp = env->cursor;
1477 env->cursor += (w = MBSIZE(env->cursor));
1478 if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || env->type >= SRE && env->parnest || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)))
1479 {
1480 if (*env->cursor)
1481 {
1482 if (*env->cursor == 'n')
1483 {
1484 pp = env->cursor++;
1485 c = '\n';
1486 }
1487 else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED))
1488 {
1489 env->token.len = 1;
1490 w = magic(env, *env->cursor, 2);
1491 if (env->token.len > 1 || w != T_BAD)
1492 {
1493 if (env->token.len == 1 && (f = classfun(w)))
1494 {
1495 if (inrange > 1)
1496 {
1497 if (env->type < SRE && !(env->flags & REG_LENIENT))
1498 goto erange;
1499 inrange = 0;
1500 }
1501 env->cursor++;
1502 ce->fun = f;
1503 ce->typ = COLL_call;
1504 ce++;
1505 continue;
1506 }
1507 if (env->token.len > 1 || w >= 0 && w < T_META)
1508 {
1509 c = w;
1510 w = mbconv(mbc, c);
1511 pp = (unsigned char*)mbc;
1512 env->cursor += env->token.len;
1513 }
1514 }
1515 }
1516 }
1517 }
1518 else if (c == ']')
1519 {
1520 if (env->cursor == begin)
1521 {
1522 rp = pp;
1523 rw = w;
1524 inrange = 1;
1525 continue;
1526 }
1527 if (inrange != 0)
1528 {
1529 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1530 if (inrange == 2)
1531 ce = col(ce, ic, NiL, 1, '-', NiL, 0, 0);
1532 }
1533 break;
1534 }
1535 else if (c == '-')
1536 {
1537 if (!inrange && env->cursor != begin && *env->cursor != ']')
1538 {
1539 if (env->type < SRE && !(env->flags & REG_LENIENT))
1540 goto erange;
1541 continue;
1542 }
1543 else if (inrange == 1)
1544 {
1545 inrange = 2;
1546 continue;
1547 }
1548 }
1549 else if (c == '[')
1550 {
1551 switch (*env->cursor)
1552 {
1553 case 0:
1554 goto error;
1555 case ':':
1556 if (env->regexp)
1557 goto complicated_normal;
1558 if (inrange == 1)
1559 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1560 if (!(f = regclass((char*)env->cursor, (char**)&env->cursor)))
1561 {
1562 if (env->cursor == start && (c = *(env->cursor + 1)) && *(env->cursor + 2) == ':' && *(env->cursor + 3) == ']' && *(env->cursor + 4) == ']')
1563 {
1564 switch (c)
1565 {
1566 case '<':
1567 i = REX_WBEG;
1568 break;
1569 case '>':
1570 i = REX_WEND;
1571 break;
1572 default:
1573 i = 0;
1574 break;
1575 }
1576 if (i)
1577 {
1578 env->cursor += 5;
1579 drop(env->disc, e);
1580 return node(env, i, 0, 0, 0);
1581 }
1582 }
1583 env->error = REG_ECTYPE;
1584 goto error;
1585 }
1586 ce->fun = f;
1587 ce->typ = COLL_call;
1588 ce++;
1589 inrange = 0;
1590 continue;
1591 case '=':
1592 if (env->regexp)
1593 goto complicated_normal;
1594 if (inrange == 2)
1595 goto erange;
1596 if (inrange == 1)
1597 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1598 pp = (unsigned char*)cb[inrange];
1599 rp = env->cursor + 1;
1600 if ((rw = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0)
1601 goto ecollate;
1602 wp = (char*)pp;
1603 wc = mbchar(wp);
1604 c = 0;
1605 if (ic)
1606 {
1607 if (iswupper(wc))
1608 {
1609 wc = towlower(wc);
1610 rw = mbconv((char*)pp, wc);
1611 c = 'u';
1612 }
1613 else if (iswlower(wc))
1614 c = 'l';
1615 }
1616 for (;;)
1617 {
1618 mbxfrm(key.key, (char*)pp, COLL_KEY_MAX);
1619 if (!(cc = (Cchr_t*)dtsearch(dt, &key)) && !(cc = (Cchr_t*)dtprev(dt, &key)))
1620 goto ecollate;
1621 xc = (tc = (Cchr_t*)dtprev(dt, cc)) && !strcasecmp((char*)tc->nam, (char*)cc->nam) ? tc : cc;
1622 if (c == 'l' || c == 'L' && !(c = 0))
1623 ce->typ = COLL_range_lc;
1624 else if (c == 'u' || c == 'U' && !(c = 0))
1625 ce->typ = COLL_range_uc;
1626 else
1627 ce->typ = COLL_range;
1628 strcpy((char*)ce->beg, (char*)xc->key);
1629 if (!(cc = (Cchr_t*)dtnext(dt, cc)))
1630 goto ecollate;
1631 if (!strcasecmp((char*)xc->nam, (char*)cc->nam) && (tc = (Cchr_t*)dtnext(dt, cc)))
1632 cc = tc;
1633 strcpy((char*)ce->end, (char*)cc->key);
1634 ce->max = -1;
1635 ce++;
1636 if (!c)
1637 break;
1638 if (c == 'u')
1639 {
1640 wc = towlower(wc);
1641 c = 'L';
1642 }
1643 else
1644 {
1645 wc = towupper(wc);
1646 c = 'U';
1647 }
1648 rw = mbconv((char*)pp, wc);
1649 }
1650 inrange = 0;
1651 c = *pp;
1652 continue;
1653 case '.':
1654 if (env->regexp)
1655 goto complicated_normal;
1656 pp = (unsigned char*)cb[inrange];
1657 if ((w = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0)
1658 goto ecollate;
1659 c = buf[0];
1660 break;
1661 default:
1662 complicated_normal:
1663 if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))
1664 goto error;
1665 break;
1666 }
1667 }
1668 if (inrange == 2)
1669 {
1670 ce = col(ce, ic, rp, rw, rc, pp, w, c);
1671 if (strcmp((char*)ce->beg, (char*)ce->end) > 0)
1672 {
1673 if (env->type < SRE && !(env->flags & REG_LENIENT))
1674 goto erange;
1675 (ce-1)->typ = COLL_char;
1676 strcpy((char*)ce->beg, (char*)(ce-1)->end);
1677 ce->typ = COLL_char;
1678 ce++;
1679 }
1680 inrange = env->type >= SRE || (env->flags & REG_LENIENT);
1681 }
1682 else if (inrange == 1)
1683 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1684 else
1685 inrange = 1;
1686 rp = pp;
1687 rw = w;
1688 rc = c;
1689 }
1690 ce->typ = COLL_end;
1691 return e;
1692 }
1693 }
1694 #endif
1695 if (collate)
1696 goto ecollate;
1697 if (env->flags & REG_ICASE)
1698 for (i = 0; i <= UCHAR_MAX; i++)
1699 if (settst(e->re.charclass, i))
1700 {
1701 if (isupper(i))
1702 c = tolower(i);
1703 else if (islower(i))
1704 c = toupper(i);
1705 else
1706 continue;
1707 setadd(e->re.charclass, c);
1708 }
1709 if (neg)
1710 {
1711 for (i = 0; i < elementsof(e->re.charclass->bits); i++)
1712 e->re.charclass->bits[i] ^= ~0;
1713 if (env->explicit >= 0)
1714 setclr(e->re.charclass, env->explicit);
1715 }
1716 return e;
1717 ecollate:
1718 env->error = REG_ECOLLATE;
1719 goto error;
1720 erange:
1721 env->error = REG_ERANGE;
1722 error:
1723 drop(env->disc, e);
1724 if (!env->error)
1725 env->error = REG_EBRACK;
1726 return 0;
1727 }
1728
1729 static Rex_t*
ccl(Cenv_t * env,int type)1730 ccl(Cenv_t* env, int type)
1731 {
1732 int i;
1733 Rex_t* e;
1734 Celt_t* ce;
1735 regclass_t f;
1736
1737 if (!(f = classfun(type)))
1738 {
1739 env->error = REG_BADESC;
1740 return 0;
1741 }
1742 if (!mbcoll())
1743 {
1744 if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t))))
1745 return 0;
1746 for (i = 0; i <= UCHAR_MAX; i++)
1747 if ((*f)(i))
1748 setadd(e->re.charclass, i);
1749 if (env->explicit >= 0)
1750 setclr(e->re.charclass, env->explicit);
1751 }
1752 else
1753 {
1754 if (!(e = node(env, REX_COLL_CLASS, 1, 1, 2 * sizeof(Celt_t))))
1755 return 0;
1756 ce = (Celt_t*)e->re.data;
1757 e->re.collate.invert = 0;
1758 e->re.collate.elements = ce;
1759 ce->fun = f;
1760 ce->typ = COLL_call;
1761 ce++;
1762 ce->typ = COLL_end;
1763 }
1764 return e;
1765 }
1766
1767 static Rex_t*
rep(Cenv_t * env,Rex_t * e,int number,int last)1768 rep(Cenv_t* env, Rex_t* e, int number, int last)
1769 {
1770 Rex_t* f;
1771 unsigned long m = 0;
1772 unsigned long n = RE_DUP_INF;
1773 int minimal = -1;
1774
1775 if (!e)
1776 return 0;
1777 switch (token(env))
1778 {
1779 case T_BANG:
1780 eat(env);
1781 if (!(f = node(env, REX_NEG, m, n, 0)))
1782 {
1783 drop(env->disc, e);
1784 return 0;
1785 }
1786 f->re.group.expr.rex = e;
1787 return f;
1788 case T_QUES:
1789 eat(env);
1790 n = 1;
1791 break;
1792 case T_STAR:
1793 eat(env);
1794 break;
1795 case T_PLUS:
1796 eat(env);
1797 m = 1;
1798 break;
1799 case T_LEFT:
1800 eat(env);
1801 m = env->token.min;
1802 n = env->token.max;
1803 break;
1804 default:
1805 return e;
1806 }
1807 if (env->token.att)
1808 minimal = 1;
1809 else if (env->type < SRE)
1810 switch (token(env))
1811 {
1812 case T_QUES:
1813 eat(env);
1814 minimal = !(env->flags & REG_MINIMAL);
1815 break;
1816 case T_STAR: /*AST*/
1817 eat(env);
1818 minimal = !!(env->flags & REG_MINIMAL);
1819 break;
1820 }
1821 switch (e->type)
1822 {
1823 case REX_DOT:
1824 case REX_CLASS:
1825 case REX_COLL_CLASS:
1826 case REX_ONECHAR:
1827 e->lo = m;
1828 e->hi = n;
1829 if (minimal >= 0)
1830 mark(e, minimal);
1831 return e;
1832 #if HUH_2002_08_07
1833 case REX_BEG:
1834 #endif
1835 case REX_BEG_STR:
1836 case REX_END_STR:
1837 case REX_FIN_STR:
1838 case REX_WBEG:
1839 case REX_WEND:
1840 case REX_WORD:
1841 case REX_WORD_NOT:
1842 env->error = REG_BADRPT;
1843 drop(env->disc, e);
1844 return 0;
1845 }
1846 if (m == 1 && n == 1)
1847 {
1848 if (minimal >= 0)
1849 mark(e, minimal);
1850 return e;
1851 }
1852 if (!(f = node(env, REX_REP, m, n, 0)))
1853 {
1854 drop(env->disc, e);
1855 return 0;
1856 }
1857 f->re.group.expr.rex = e;
1858 f->re.group.number = number;
1859 f->re.group.last = last;
1860 if (minimal >= 0)
1861 mark(f, minimal);
1862 if (m <= n && n)
1863 {
1864 for (; e && e->type >= REX_GROUP && e->type <= REX_GROUP_CUT; e = e->re.group.expr.rex);
1865 if (e && e->type == REX_NEG)
1866 f->type = REX_GROUP;
1867 }
1868 return f;
1869 }
1870
1871 static int
isstring(Rex_t * e)1872 isstring(Rex_t* e)
1873 {
1874 switch (e->type)
1875 {
1876 case REX_ONECHAR:
1877 return e->lo == 1 && e->hi == 1;
1878 case REX_STRING:
1879 return 1;
1880 }
1881 return 0;
1882 }
1883
1884 static Trie_node_t*
trienode(Cenv_t * env,int c)1885 trienode(Cenv_t* env, int c)
1886 {
1887 Trie_node_t* t;
1888
1889 if (t = (Trie_node_t*)alloc(env->disc, 0, sizeof(Trie_node_t)))
1890 {
1891 memset(t, 0, sizeof(Trie_node_t));
1892 t->c = c;
1893 }
1894 return t;
1895 }
1896
1897 static int
insert(Cenv_t * env,Rex_t * f,Rex_t * g)1898 insert(Cenv_t* env, Rex_t* f, Rex_t* g)
1899 {
1900 unsigned char* s;
1901 unsigned char* e;
1902 Trie_node_t* t;
1903 int len;
1904 unsigned char tmp[2];
1905
1906 switch (f->type)
1907 {
1908 case REX_ONECHAR:
1909 *(s = tmp) = f->re.onechar;
1910 e = s + 1;
1911 break;
1912 case REX_STRING:
1913 s = f->re.string.base;
1914 e = s + f->re.string.size;
1915 break;
1916 default:
1917 return 1;
1918 }
1919 if (!(t = g->re.trie.root[*s]) && !(t = g->re.trie.root[*s] = trienode(env, *s)))
1920 return 1;
1921 for (len = 1;;)
1922 {
1923 if (t->c == *s)
1924 {
1925 if (++s >= e)
1926 break;
1927 if (!t->son && !(t->son = trienode(env, *s)))
1928 return 1;
1929 t = t->son;
1930 len++;
1931 }
1932 else
1933 {
1934 if (!t->sib && !(t->sib = trienode(env, *s)))
1935 return 1;
1936 t = t->sib;
1937 }
1938 }
1939 if (g->re.trie.min > len)
1940 g->re.trie.min = len;
1941 if (g->re.trie.max < len)
1942 g->re.trie.max = len;
1943 t->end = 1;
1944 return 0;
1945 }
1946
1947 /*
1948 * trie() tries to combine nontrivial e and f into a REX_TRIE
1949 * unless 0 is returned, e and f are deleted as far as possible
1950 * also called by regcomb
1951 */
1952
1953 static Rex_t*
trie(Cenv_t * env,Rex_t * e,Rex_t * f)1954 trie(Cenv_t* env, Rex_t* e, Rex_t* f)
1955 {
1956 Rex_t* g;
1957
1958 if (e->next || f->next || !isstring(e) || e->flags != f->flags)
1959 return 0;
1960 if (isstring(f))
1961 {
1962 if (!(g = node(env, REX_TRIE, 0, 0, (UCHAR_MAX + 1) * sizeof(Trie_node_t*))))
1963 return 0;
1964 g->re.trie.min = INT_MAX;
1965 if (insert(env, f, g))
1966 goto nospace;
1967 drop(env->disc, f);
1968 }
1969 else if (f->type != REX_TRIE)
1970 return 0;
1971 else
1972 g = f;
1973 if (insert(env, e, g))
1974 goto nospace;
1975 drop(env->disc, e);
1976 return g;
1977 nospace:
1978 if (g != f)
1979 drop(env->disc, g);
1980 return 0;
1981 }
1982
1983 static Rex_t* alt(Cenv_t*, int, int);
1984
1985 static int
chr(register Cenv_t * env,int * escaped)1986 chr(register Cenv_t* env, int* escaped)
1987 {
1988 unsigned char* p;
1989 int c;
1990
1991 *escaped = 0;
1992 if (!(c = *env->cursor))
1993 return -1;
1994 env->cursor++;
1995 if (c == '\\')
1996 {
1997 if (env->flags & REG_SHELL_ESCAPED)
1998 return c;
1999 if (!(c = *(env->cursor + 1)) || c == env->terminator)
2000 {
2001 if (env->flags & REG_LENIENT)
2002 return c ? c : '\\';
2003 env->error = REG_EESCAPE;
2004 return -1;
2005 }
2006 p = env->cursor;
2007 c = chresc((char*)env->cursor - 1, (char**)&env->cursor);
2008 *escaped = env->cursor - p;
2009 }
2010 return c;
2011 }
2012
2013 /*
2014 * open the perly gates
2015 */
2016
2017 static Rex_t*
grp(Cenv_t * env,int parno)2018 grp(Cenv_t* env, int parno)
2019 {
2020 Rex_t* e;
2021 Rex_t* f;
2022 int c;
2023 int i;
2024 int n;
2025 int x;
2026 int esc;
2027 int typ;
2028 int beg;
2029 unsigned char* p;
2030
2031 beg = env->pattern == env->cursor - env->token.len;
2032 if (!(c = env->token.lex) && (c = *env->cursor))
2033 env->cursor++;
2034 env->token.len = 0;
2035 env->parnest++;
2036 typ = -1;
2037 switch (c)
2038 {
2039 case '-':
2040 case '+':
2041 case 'a':
2042 case 'g':
2043 case 'i':
2044 case 'l':
2045 case 'm':
2046 case 'p':
2047 case 'r':
2048 case 's':
2049 case 'x':
2050 case 'A':
2051 case 'B':
2052 case 'E':
2053 case 'F':
2054 case 'G':
2055 case 'I':
2056 case 'K':
2057 case 'L':
2058 case 'M': /* glob(3) */
2059 case 'N': /* glob(3) */
2060 case 'O': /* glob(3) */
2061 case 'P':
2062 case 'R': /* pcre */
2063 case 'S':
2064 case 'U': /* pcre */
2065 case 'X': /* pcre */
2066 x = REX_GROUP;
2067 i = 1;
2068 env->token.push = 1;
2069 for (;;)
2070 {
2071 switch (c)
2072 {
2073 case ')':
2074 if (!(env->flags & REG_LITERAL))
2075 {
2076 env->error = REG_BADRPT;
2077 return 0;
2078 }
2079 /*FALLTHROUGH*/
2080 case 0:
2081 case T_CLOSE:
2082 x = 0;
2083 goto done;
2084 case ':':
2085 eat(env);
2086 if (token(env) == T_CLOSE)
2087 x = 0;
2088 goto done;
2089 case '-':
2090 i = 0;
2091 break;
2092 case '+':
2093 i = 1;
2094 break;
2095 case 'a':
2096 if (i)
2097 env->flags |= (REG_LEFT|REG_RIGHT);
2098 else
2099 env->flags &= ~(REG_LEFT|REG_RIGHT);
2100 break;
2101 case 'g':
2102 if (i)
2103 env->flags &= ~REG_MINIMAL;
2104 else
2105 env->flags |= REG_MINIMAL;
2106 break;
2107 case 'i':
2108 if (i)
2109 env->flags |= REG_ICASE;
2110 else
2111 env->flags &= ~REG_ICASE;
2112 break;
2113 case 'l':
2114 if (i)
2115 env->flags |= REG_LEFT;
2116 else
2117 env->flags &= ~REG_LEFT;
2118 break;
2119 case 'm':
2120 if (i)
2121 env->flags |= REG_NEWLINE;
2122 else
2123 env->flags &= ~REG_NEWLINE;
2124 env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1;
2125 break;
2126 case 'p':
2127 if (i)
2128 env->flags &= ~REG_LENIENT;
2129 else
2130 env->flags |= REG_LENIENT;
2131 break;
2132 case 'r':
2133 if (i)
2134 env->flags |= REG_RIGHT;
2135 else
2136 env->flags &= ~REG_RIGHT;
2137 break;
2138 case 's':
2139 if (i)
2140 env->flags |= REG_SPAN;
2141 else
2142 env->flags &= ~REG_SPAN;
2143 env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1;
2144 break;
2145 case 'x':
2146 if (i)
2147 env->flags |= REG_COMMENT;
2148 else
2149 env->flags &= ~REG_COMMENT;
2150 break;
2151 case 'A':
2152 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2153 env->flags |= REG_AUGMENTED|REG_EXTENDED;
2154 typ = ARE;
2155 break;
2156 case 'B':
2157 case 'G':
2158 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2159 typ = BRE;
2160 break;
2161 case 'E':
2162 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2163 env->flags |= REG_EXTENDED;
2164 typ = ERE;
2165 break;
2166 case 'F':
2167 case 'L':
2168 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2169 env->flags |= REG_LITERAL;
2170 typ = ERE;
2171 break;
2172 case 'K':
2173 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2174 env->flags |= REG_AUGMENTED|REG_SHELL|REG_LEFT|REG_RIGHT;
2175 typ = KRE;
2176 break;
2177 case 'M':
2178 /* used by caller to disable glob(3) GLOB_BRACE */
2179 break;
2180 case 'N':
2181 /* used by caller to disable glob(3) GLOB_NOCHECK */
2182 break;
2183 case 'O':
2184 /* used by caller to disable glob(3) GLOB_STARSTAR */
2185 break;
2186 case 'P':
2187 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2188 env->flags |= REG_EXTENDED|REG_CLASS_ESCAPE;
2189 typ = ERE;
2190 break;
2191 case 'S':
2192 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2193 env->flags |= REG_SHELL|REG_LEFT|REG_RIGHT;
2194 typ = SRE;
2195 break;
2196 case 'U': /* PCRE_UNGREEDY */
2197 if (i)
2198 env->flags |= REG_MINIMAL;
2199 else
2200 env->flags &= ~REG_MINIMAL;
2201 break;
2202 case 'X': /* PCRE_EXTRA */
2203 break;
2204 default:
2205 env->error = REG_BADRPT;
2206 return 0;
2207 }
2208 eat(env);
2209 c = token(env);
2210 }
2211 done:
2212 break;
2213 case ':':
2214 x = REX_GROUP;
2215 break;
2216 case '=':
2217 x = REX_GROUP_AHEAD;
2218 break;
2219 case '!':
2220 x = REX_GROUP_AHEAD_NOT;
2221 break;
2222 case '<':
2223 switch (token(env))
2224 {
2225 case '=':
2226 x = REX_GROUP_BEHIND;
2227 break;
2228 case '!':
2229 case T_BANG:
2230 x = REX_GROUP_BEHIND_NOT;
2231 break;
2232 default:
2233 env->error = REG_BADRPT;
2234 return 0;
2235 }
2236 eat(env);
2237 break;
2238 case '>':
2239 x = REX_GROUP_CUT;
2240 break;
2241 case '%':
2242 case T_PERCENT:
2243 e = node(env, REX_NEST, 0, 0, (UCHAR_MAX + 1) * sizeof(unsigned short));
2244 e->re.nest.primary = isalnum(*env->cursor) ? -1 : *env->cursor;
2245 n = 1;
2246 for (;;)
2247 {
2248 switch (i = chr(env, &esc))
2249 {
2250 case -1:
2251 case 0:
2252 invalid:
2253 env->cursor -= esc + 1;
2254 env->error = REG_EPAREN;
2255 return 0;
2256 case 'D':
2257 x = REX_NEST_delimiter;
2258 /*FALLTHROUGH*/
2259 delimiter:
2260 if ((i = chr(env, &esc)) < 0)
2261 goto invalid;
2262 if (e->re.nest.type[i] & ~x)
2263 goto invalid;
2264 e->re.nest.type[i] = x;
2265 continue;
2266 case 'E':
2267 x = REX_NEST_escape;
2268 goto delimiter;
2269 case 'L':
2270 x = REX_NEST_literal;
2271 goto quote;
2272 case 'O':
2273 switch (i = chr(env, &esc))
2274 {
2275 case 'T':
2276 e->re.nest.type[UCHAR_MAX+1] |= REX_NEST_terminator;
2277 break;
2278 default:
2279 goto invalid;
2280 }
2281 continue;
2282 case 'Q':
2283 x = REX_NEST_quote;
2284 /*FALLTHROUGH*/
2285 quote:
2286 if ((i = chr(env, &esc)) < 0)
2287 goto invalid;
2288 if (e->re.nest.type[i] & ~x)
2289 goto invalid;
2290 e->re.nest.type[i] = x|REX_NEST_open|REX_NEST_close|(i<<REX_NEST_SHIFT);
2291 continue;
2292 case 'S':
2293 x = REX_NEST_separator;
2294 goto delimiter;
2295 case 'T':
2296 x = REX_NEST_terminator;
2297 goto delimiter;
2298 case '|':
2299 case '&':
2300 if (!esc)
2301 goto invalid;
2302 goto nesting;
2303 case '(':
2304 if (!esc)
2305 n++;
2306 goto nesting;
2307 case ')':
2308 if (!esc && !--n)
2309 break;
2310 goto nesting;
2311 default:
2312 nesting:
2313 if (isalnum(i) || (e->re.nest.type[i] & (REX_NEST_close|REX_NEST_escape|REX_NEST_literal|REX_NEST_quote|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)))
2314 goto invalid;
2315 e->re.nest.type[i] = REX_NEST_open;
2316 if ((x = chr(env, &esc)) < 0 || (e->re.nest.type[x] & (REX_NEST_close|REX_NEST_escape|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)))
2317 goto invalid;
2318 if (!esc)
2319 {
2320 if (x == ')' && !--n)
2321 goto invalid;
2322 else if (x == '(')
2323 n++;
2324 }
2325 e->re.nest.type[x] |= REX_NEST_close;
2326 e->re.nest.type[i] |= x << REX_NEST_SHIFT;
2327 continue;
2328 }
2329 break;
2330 }
2331 env->parnest--;
2332 if (c == T_PERCENT)
2333 for (n = 0; n < 2; n++)
2334 {
2335 parno = ++env->parno;
2336 if (!(f = node(env, REX_GROUP, 0, 0, 0)))
2337 {
2338 drop(env->disc, e);
2339 return 0;
2340 }
2341 if (parno < elementsof(env->paren))
2342 env->paren[parno] = f;
2343 f->re.group.back = 0;
2344 f->re.group.number = parno;
2345 f->re.group.expr.rex = e;
2346 e = f;
2347 }
2348 return e;
2349 case '(':
2350 c = 0;
2351 if (isdigit(*env->cursor))
2352 {
2353 f = 0;
2354 do
2355 {
2356 if (c > (INT_MAX / 10))
2357 {
2358 env->error = REG_BADRPT;
2359 return 0;
2360 }
2361 c = c * 10 + (*env->cursor++ - '0');
2362 } while (isdigit(*env->cursor));
2363 if (*env->cursor++ != ')')
2364 {
2365 env->error = REG_BADRPT;
2366 return 0;
2367 }
2368 if (c && env->type >= SRE)
2369 c = c * 2 - 1;
2370 if (!c || c > env->parno || !env->paren[c])
2371 {
2372 if (!(env->flags & REG_LENIENT))
2373 {
2374 env->error = REG_ESUBREG;
2375 return 0;
2376 }
2377 if (c)
2378 c = -1;
2379 }
2380 }
2381 else
2382 {
2383 if (env->type < SRE && *env->cursor++ != '?')
2384 {
2385 env->error = REG_BADRPT;
2386 return 0;
2387 }
2388 if (!(f = grp(env, parno + 1)) && env->error)
2389 return 0;
2390 }
2391 if (!(e = node(env, REX_GROUP_COND, 0, 0, 0)))
2392 {
2393 drop(env->disc, f);
2394 return 0;
2395 }
2396 e->re.group.size = c;
2397 e->re.group.expr.binary.left = f;
2398 if (!(e->re.group.expr.binary.right = alt(env, parno, 1)))
2399 {
2400 drop(env->disc, e);
2401 return 0;
2402 }
2403 if (token(env) != T_CLOSE)
2404 {
2405 env->error = REG_EPAREN;
2406 return 0;
2407 }
2408 eat(env);
2409 env->parnest--;
2410 return rep(env, e, parno, parno);
2411 case '{':
2412 p = env->cursor;
2413 n = 1;
2414 while (c = *env->cursor)
2415 {
2416 if (c == '\\' && *(env->cursor + 1))
2417 env->cursor++;
2418 else if (c == '{')
2419 n++;
2420 else if (c == '}' && !--n)
2421 break;
2422 else if (c == env->delimiter || c == env->terminator)
2423 break;
2424 env->cursor++;
2425 }
2426 if (c != '}')
2427 {
2428 env->error = REG_EBRACE;
2429 return 0;
2430 }
2431 if (*++env->cursor != ')')
2432 {
2433 env->error = REG_EPAREN;
2434 return 0;
2435 }
2436 env->cursor++;
2437 env->parnest--;
2438 if (env->disc->re_version < REG_VERSION_EXEC)
2439 {
2440 env->error = REG_BADRPT;
2441 return 0;
2442 }
2443 if (!env->disc->re_execf)
2444 return 0;
2445 if (!(e = node(env, REX_EXEC, 0, 0, 0)))
2446 return 0;
2447 e->re.exec.text = (const char*)p;
2448 e->re.exec.size = env->cursor - p - 2;
2449 if (!env->disc->re_compf)
2450 e->re.exec.data = 0;
2451 else
2452 e->re.exec.data = (*env->disc->re_compf)(env->regex, e->re.exec.text, e->re.exec.size, env->disc);
2453 return e;
2454 case '0': case '1': case '2': case '3': case '4':
2455 case '5': case '6': case '7': case '8': case '9':
2456 c -= '0';
2457 while (isdigit(*env->cursor))
2458 {
2459 if (c > (INT_MAX / 10))
2460 {
2461 env->error = REG_ESUBREG;
2462 return 0;
2463 }
2464 c = c * 10 + *env->cursor++ - '0';
2465 }
2466 if (*env->cursor == ')')
2467 {
2468 env->cursor++;
2469 env->parnest--;
2470 env->token.len = 1;
2471 if (c > env->parno || !env->paren[c])
2472 {
2473 env->error = REG_ESUBREG;
2474 return 0;
2475 }
2476 env->paren[c]->re.group.back = 1;
2477 return rep(env, node(env, REX_BACK, c, 0, 0), 0, 0);
2478 }
2479 /*FALLTHROUGH*/
2480 default:
2481 env->error = REG_BADRPT;
2482 return 0;
2483 }
2484 if (x && !(e = alt(env, parno, 0)))
2485 return 0;
2486 c = token(env);
2487 env->parnest--;
2488 if (c != T_CLOSE && (!(env->flags & REG_LITERAL) || c != ')'))
2489 {
2490 env->error = REG_EPAREN;
2491 return 0;
2492 }
2493 eat(env);
2494 if (typ >= 0)
2495 {
2496 if (beg)
2497 env->pattern = env->cursor;
2498 env->type = typ;
2499 }
2500 if (!x)
2501 return 0;
2502 if (!(f = node(env, x, 0, 0, 0)))
2503 {
2504 drop(env->disc, e);
2505 return 0;
2506 }
2507 f->re.group.expr.rex = e;
2508 if (x == REX_GROUP_BEHIND || x == REX_GROUP_BEHIND_NOT)
2509 {
2510 if (stats(env, e))
2511 {
2512 drop(env->disc, f);
2513 if (!env->error)
2514 env->error = REG_ECOUNT;
2515 return 0;
2516 }
2517 f->re.group.size = env->stats.m;
2518 memset(&env->stats, 0, sizeof(env->stats));
2519 }
2520 switch (x)
2521 {
2522 case REX_GROUP:
2523 case REX_GROUP_CUT:
2524 f = rep(env, f, parno, env->parno);
2525 break;
2526 }
2527 return f;
2528 }
2529
2530 static Rex_t*
seq(Cenv_t * env)2531 seq(Cenv_t* env)
2532 {
2533 Rex_t* e;
2534 Rex_t* f;
2535 Token_t tok;
2536 int c;
2537 int i;
2538 int n;
2539 int x;
2540 int parno;
2541 int type;
2542 regflags_t flags;
2543 unsigned char* s;
2544 unsigned char* p;
2545 unsigned char* t;
2546 unsigned char* u;
2547 unsigned char buf[256];
2548
2549 for (;;)
2550 {
2551 s = buf;
2552 while ((c = token(env)) < T_META && s < &buf[sizeof(buf) - env->token.len])
2553 {
2554 x = c;
2555 p = env->cursor;
2556 if (c >= 0)
2557 {
2558 n = 1;
2559 *s++ = (env->flags & REG_ICASE) ? toupper(c) : c;
2560 }
2561 else if (c == C_ESC || (env->flags & REG_ICASE))
2562 {
2563 c = (c == C_ESC) ? env->token.lex : mbchar(p);
2564 if (env->flags & REG_ICASE)
2565 c = towupper(c);
2566 if ((&buf[sizeof(buf)] - s) < MB_CUR_MAX)
2567 break;
2568 if ((n = mbconv((char*)s, c)) < 0)
2569 *s++ = c;
2570 else if (n)
2571 s += n;
2572 else
2573 {
2574 n = 1;
2575 *s++ = 0;
2576 }
2577 }
2578 else
2579 {
2580 n = env->token.len - env->token.esc;
2581 for (t = p, u = s + n; s < u; *s++ = *t++);
2582 }
2583 eat(env);
2584 }
2585 if (c == T_BAD)
2586 return 0;
2587 if (s > buf)
2588 switch (c)
2589 {
2590 case T_STAR:
2591 case T_PLUS:
2592 case T_LEFT:
2593 case T_QUES:
2594 case T_BANG:
2595 if ((s -= n) == buf)
2596 e = 0;
2597 else
2598 {
2599 i = s - buf;
2600 if (!(e = node(env, REX_STRING, 0, 0, i)))
2601 return 0;
2602 memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, i);
2603 e->re.string.size = i;
2604 }
2605 if (x >= 0)
2606 {
2607 if (!(f = node(env, REX_ONECHAR, 1, 1, 0)))
2608 {
2609 drop(env->disc, e);
2610 return 0;
2611 }
2612 f->re.onechar = (env->flags & REG_ICASE) ? toupper(x) : x;
2613 }
2614 else
2615 {
2616 if (!(f = node(env, REX_STRING, 0, 0, n)))
2617 return 0;
2618 memcpy((char*)(f->re.string.base = (unsigned char*)f->re.data), (char*)p, n);
2619 f->re.string.size = n;
2620 }
2621 if (!(f = rep(env, f, 0, 0)) || !(f = cat(env, f, seq(env))))
2622 {
2623 drop(env->disc, e);
2624 return 0;
2625 }
2626 if (e)
2627 f = cat(env, e, f);
2628 return f;
2629 default:
2630 c = s - buf;
2631 if (!(e = node(env, REX_STRING, 0, 0, c)))
2632 return 0;
2633 memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, c);
2634 e->re.string.size = c;
2635 return cat(env, e, seq(env));
2636 }
2637 else if (c > T_BACK)
2638 {
2639 eat(env);
2640 c -= T_BACK;
2641 if (c > env->parno || !env->paren[c])
2642 {
2643 env->error = REG_ESUBREG;
2644 return 0;
2645 }
2646 env->paren[c]->re.group.back = 1;
2647 e = rep(env, node(env, REX_BACK, c, 0, 0), 0, 0);
2648 }
2649 else
2650 switch (c)
2651 {
2652 case T_AND:
2653 case T_CLOSE:
2654 case T_BAR:
2655 case T_END:
2656 return node(env, REX_NULL, 0, 0, 0);
2657 case T_DOLL:
2658 eat(env);
2659 e = rep(env, node(env, REX_END, 0, 0, 0), 0, 0);
2660 break;
2661 case T_CFLX:
2662 eat(env);
2663 if ((e = node(env, REX_BEG, 0, 0, 0)) && (env->flags & REG_EXTENDED))
2664 e = rep(env, e, 0, 0);
2665 break;
2666 case T_OPEN:
2667 tok = env->token;
2668 eat(env);
2669 flags = env->flags;
2670 type = env->type;
2671 if (env->token.att)
2672 env->flags |= REG_MINIMAL;
2673 env->parnest++;
2674 if (env->type == KRE)
2675 ++env->parno;
2676 parno = ++env->parno;
2677 if (!(e = alt(env, parno + 1, 0)))
2678 break;
2679 if (e->type == REX_NULL && env->type == ERE && !(env->flags & REG_NULL))
2680 {
2681 drop(env->disc, e);
2682 env->error = (*env->cursor == 0 || *env->cursor == env->delimiter || *env->cursor == env->terminator) ? REG_EPAREN : REG_ENULL;
2683 return 0;
2684 }
2685 if (token(env) != T_CLOSE)
2686 {
2687 drop(env->disc, e);
2688 env->error = REG_EPAREN;
2689 return 0;
2690 }
2691 env->parnest--;
2692 eat(env);
2693 if (!(f = node(env, REX_GROUP, 0, 0, 0)))
2694 {
2695 drop(env->disc, e);
2696 return 0;
2697 }
2698 if (parno < elementsof(env->paren))
2699 env->paren[parno] = f;
2700 f->re.group.back = 0;
2701 f->re.group.number = parno;
2702 f->re.group.expr.rex = e;
2703 if (tok.lex)
2704 {
2705 tok.push = 1;
2706 env->token = tok;
2707 }
2708 if (!(e = rep(env, f, parno, env->parno)))
2709 return 0;
2710 if (env->type == KRE)
2711 {
2712 if (!(f = node(env, REX_GROUP, 0, 0, 0)))
2713 {
2714 drop(env->disc, e);
2715 return 0;
2716 }
2717 if (--parno < elementsof(env->paren))
2718 env->paren[parno] = f;
2719 f->re.group.back = 0;
2720 f->re.group.number = parno;
2721 f->re.group.expr.rex = e;
2722 e = f;
2723 }
2724 env->flags = flags;
2725 env->type = type;
2726 break;
2727 case T_GROUP:
2728 p = env->cursor;
2729 eat(env);
2730 flags = env->flags;
2731 type = env->type;
2732 if (!(e = grp(env, env->parno + 1)))
2733 {
2734 if (env->error)
2735 return 0;
2736 if (env->literal == env->pattern && env->literal == p)
2737 env->literal = env->cursor;
2738 continue;
2739 }
2740 env->flags = flags;
2741 env->type = type;
2742 break;
2743 case T_BRA:
2744 eat(env);
2745 if (e = bra(env))
2746 e = rep(env, e, 0, 0);
2747 break;
2748 case T_ALNUM:
2749 case T_ALNUM_NOT:
2750 case T_DIGIT:
2751 case T_DIGIT_NOT:
2752 case T_SPACE:
2753 case T_SPACE_NOT:
2754 eat(env);
2755 if (e = ccl(env, c))
2756 e = rep(env, e, 0, 0);
2757 break;
2758 case T_LT:
2759 eat(env);
2760 e = rep(env, node(env, REX_WBEG, 0, 0, 0), 0, 0);
2761 break;
2762 case T_GT:
2763 eat(env);
2764 e = rep(env, node(env, REX_WEND, 0, 0, 0), 0, 0);
2765 break;
2766 case T_DOT:
2767 eat(env);
2768 e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0);
2769 break;
2770 case T_DOTSTAR:
2771 eat(env);
2772 env->token.lex = T_STAR;
2773 env->token.push = 1;
2774 e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0);
2775 break;
2776 case T_SLASHPLUS:
2777 eat(env);
2778 env->token.lex = T_PLUS;
2779 env->token.push = 1;
2780 if (e = node(env, REX_ONECHAR, 1, 1, 0))
2781 {
2782 e->re.onechar = '/';
2783 e = rep(env, e, 0, 0);
2784 }
2785 break;
2786 case T_WORD:
2787 eat(env);
2788 e = rep(env, node(env, REX_WORD, 0, 0, 0), 0, 0);
2789 break;
2790 case T_WORD_NOT:
2791 eat(env);
2792 e = rep(env, node(env, REX_WORD_NOT, 0, 0, 0), 0, 0);
2793 break;
2794 case T_BEG_STR:
2795 eat(env);
2796 e = rep(env, node(env, REX_BEG_STR, 0, 0, 0), 0, 0);
2797 break;
2798 case T_END_STR:
2799 eat(env);
2800 e = rep(env, node(env, REX_END_STR, 0, 0, 0), 0, 0);
2801 break;
2802 case T_FIN_STR:
2803 eat(env);
2804 e = rep(env, node(env, REX_FIN_STR, 0, 0, 0), 0, 0);
2805 break;
2806 default:
2807 env->error = REG_BADRPT;
2808 return 0;
2809 }
2810 if (e && *env->cursor != 0 && *env->cursor != env->delimiter && *env->cursor != env->terminator)
2811 e = cat(env, e, seq(env));
2812 return e;
2813 }
2814 }
2815
2816 static Rex_t*
con(Cenv_t * env)2817 con(Cenv_t* env)
2818 {
2819 Rex_t* e;
2820 Rex_t* f;
2821 Rex_t* g;
2822
2823 if (!(e = seq(env)) || !(env->flags & REG_AUGMENTED) || token(env) != T_AND)
2824 return e;
2825 eat(env);
2826 if (!(f = con(env)))
2827 {
2828 drop(env->disc, e);
2829 return 0;
2830 }
2831 if (!(g = node(env, REX_CONJ, 0, 0, 0)))
2832 {
2833 drop(env->disc, e);
2834 drop(env->disc, f);
2835 return 0;
2836 }
2837 g->re.group.expr.binary.left = e;
2838 g->re.group.expr.binary.right = f;
2839 return g;
2840 }
2841
2842 static Rex_t*
alt(Cenv_t * env,int number,int cond)2843 alt(Cenv_t* env, int number, int cond)
2844 {
2845 Rex_t* e;
2846 Rex_t* f;
2847 Rex_t* g;
2848
2849 if (!(e = con(env)))
2850 return 0;
2851 else if (token(env) != T_BAR)
2852 {
2853 if (!cond)
2854 return e;
2855 f = 0;
2856 if (e->type == REX_NULL)
2857 goto bad;
2858 }
2859 else
2860 {
2861 eat(env);
2862 if (!(f = alt(env, number, 0)))
2863 {
2864 drop(env->disc, e);
2865 return 0;
2866 }
2867 if ((e->type == REX_NULL || f->type == REX_NULL) && !(env->flags & REG_NULL))
2868 goto bad;
2869 if (!cond && (g = trie(env, e, f)))
2870 return g;
2871 }
2872 if (!(g = node(env, REX_ALT, 0, 0, 0)))
2873 {
2874 env->error = REG_ESPACE;
2875 goto bad;
2876 }
2877 g->re.group.number = number;
2878 g->re.group.last = env->parno;
2879 g->re.group.expr.binary.left = e;
2880 g->re.group.expr.binary.right = f;
2881 return g;
2882 bad:
2883 drop(env->disc, e);
2884 drop(env->disc, f);
2885 if (!env->error)
2886 env->error = REG_ENULL;
2887 return 0;
2888 }
2889
2890 /*
2891 * add v to REX_BM tables
2892 */
2893
2894 static void
bmstr(Cenv_t * env,register Rex_t * a,unsigned char * v,int n,Bm_mask_t b)2895 bmstr(Cenv_t* env, register Rex_t* a, unsigned char* v, int n, Bm_mask_t b)
2896 {
2897 int c;
2898 int m;
2899 size_t z;
2900
2901 for (m = 0; m < n; m++)
2902 {
2903 if (!(z = n - m - 1))
2904 z = HIT;
2905 c = v[m];
2906 a->re.bm.mask[m][c] |= b;
2907 if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT)
2908 a->re.bm.skip[c] = z;
2909 if (a->flags & REG_ICASE)
2910 {
2911 if (isupper(c))
2912 c = tolower(c);
2913 else if (islower(c))
2914 c = toupper(c);
2915 else
2916 continue;
2917 a->re.bm.mask[m][c] |= b;
2918 if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT)
2919 a->re.bm.skip[c] = z;
2920 }
2921 }
2922 }
2923
2924 /*
2925 * set up BM table from trie
2926 */
2927
2928 static int
bmtrie(Cenv_t * env,Rex_t * a,unsigned char * v,Trie_node_t * x,int n,int m,Bm_mask_t b)2929 bmtrie(Cenv_t* env, Rex_t* a, unsigned char* v, Trie_node_t* x, int n, int m, Bm_mask_t b)
2930 {
2931 do
2932 {
2933 v[m] = x->c;
2934 if (m >= (n - 1))
2935 {
2936 bmstr(env, a, v, n, b);
2937 if (!(b <<= 1))
2938 {
2939 b = 1;
2940 a->re.bm.complete = 0;
2941 }
2942 else if (x->son)
2943 a->re.bm.complete = 0;
2944 }
2945 else if (x->son)
2946 b = bmtrie(env, a, v, x->son, n, m + 1, b);
2947 } while (x = x->sib);
2948 return b;
2949 }
2950
2951 /*
2952 * rewrite the expression tree for some special cases
2953 * 1. it is a null expression - illegal
2954 * 2. max length fixed string found -- use BM algorithm
2955 * 3. it begins with an unanchored string - use KMP algorithm
2956 * 0 returned on success
2957 */
2958
2959 static int
special(Cenv_t * env,regex_t * p)2960 special(Cenv_t* env, regex_t* p)
2961 {
2962 Rex_t* a;
2963 Rex_t* e;
2964 Rex_t* t;
2965 Rex_t* x;
2966 Rex_t* y;
2967 unsigned char* s;
2968 int* f;
2969 int n;
2970 int m;
2971 int k;
2972
2973 DEBUG_INIT();
2974 if (e = p->env->rex)
2975 {
2976 if ((x = env->stats.x) && x->re.string.size < 3)
2977 x = 0;
2978 if ((t = env->stats.y) && t->re.trie.min < 3)
2979 t = 0;
2980 if (x && t)
2981 {
2982 if (x->re.string.size >= t->re.trie.min)
2983 t = 0;
2984 else
2985 x = 0;
2986 }
2987 if (x || t)
2988 {
2989 Bm_mask_t** mask;
2990 Bm_mask_t* h;
2991 unsigned char* v;
2992 size_t* q;
2993 unsigned long l;
2994 int i;
2995 int j;
2996
2997 if (x)
2998 {
2999 y = x;
3000 n = m = x->re.string.size;
3001 l = env->stats.l;
3002 }
3003 else
3004 {
3005 y = t;
3006 n = t->re.trie.min;
3007 m = t->re.trie.max;
3008 l = env->stats.k;
3009 }
3010 if (!(q = (size_t*)alloc(env->disc, 0, (n + 1) * sizeof(size_t))))
3011 return 1;
3012 if (!(a = node(env, REX_BM, 0, 0, n * (sizeof(Bm_mask_t*) + (UCHAR_MAX + 1) * sizeof(Bm_mask_t)) + (UCHAR_MAX + n + 2) * sizeof(size_t))))
3013 {
3014 alloc(env->disc, q, 0);
3015 return 1;
3016 }
3017 a->flags = y->flags;
3018 a->map = y->map;
3019 a->re.bm.size = n;
3020 a->re.bm.back = (y == e || y == e->re.group.expr.rex) ? (m - n) : -1;
3021 a->re.bm.left = l - 1;
3022 a->re.bm.right = env->stats.m - l - n;
3023 a->re.bm.complete = (env->stats.e || y != e && (e->type != REX_GROUP || y != e->re.group.expr.rex) || e->next || ((a->re.bm.left + a->re.bm.right) >= 0)) ? 0 : n;
3024 h = (Bm_mask_t*)&a->re.bm.mask[n];
3025 a->re.bm.skip = (size_t*)(h + n * (UCHAR_MAX + 1));
3026 a->re.bm.fail = &a->re.bm.skip[UCHAR_MAX + 1];
3027 for (m = 0; m <= UCHAR_MAX; m++)
3028 a->re.bm.skip[m] = n;
3029 a->re.bm.skip[0] = a->re.bm.skip[env->mappednewline] = (y->next && y->next->type == REX_END) ? HIT : (n + a->re.bm.left);
3030 for (i = 1; i <= n; i++)
3031 a->re.bm.fail[i] = 2 * n - i;
3032 mask = a->re.bm.mask;
3033 for (m = 0; m < n; m++)
3034 {
3035 mask[m] = h;
3036 h += UCHAR_MAX + 1;
3037 }
3038 if (x)
3039 bmstr(env, a, x->re.string.base, n, 1);
3040 else
3041 {
3042 v = (unsigned char*)q;
3043 memset(v, 0, n);
3044 m = 1;
3045 for (i = 0; i <= UCHAR_MAX; i++)
3046 if (t->re.trie.root[i])
3047 m = bmtrie(env, a, v, t->re.trie.root[i], n, 0, m);
3048 }
3049 mask--;
3050 memset(q, 0, n * sizeof(*q));
3051 for (k = (j = n) + 1; j > 0; j--, k--)
3052 {
3053 DEBUG_TEST(0x0010,(sfprintf(sfstderr, "BM#0: k=%d j=%d\n", k, j)),(0));
3054 for (q[j] = k; k <= n; k = q[k])
3055 {
3056 for (m = 0; m <= UCHAR_MAX; m++)
3057 if (mask[k][m] == mask[j][m])
3058 {
3059 DEBUG_TEST(0x0010,sfprintf(sfstderr, "CUT1: mask[%d][%c]=mask[%d][%c]\n", k, m, j, m), (0));
3060 goto cut;
3061 }
3062 DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#2: fail[%d]=%d => %d\n", k, a->re.bm.fail[k], (a->re.bm.fail[k] > n - j) ? (n - j) : a->re.bm.fail[k]), (0));
3063 if (a->re.bm.fail[k] > n - j)
3064 a->re.bm.fail[k] = n - j;
3065 }
3066 cut: ;
3067 }
3068 for (i = 1; i <= n; i++)
3069 if (a->re.bm.fail[i] > n + k - i)
3070 {
3071 DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#4: fail[%d]=%d => %d\n", i, a->re.bm.fail[i], n + k - i), (0));
3072 a->re.bm.fail[i] = n + k - i;
3073 }
3074 #if _AST_REGEX_DEBUG
3075 if (DEBUG_TEST(0x0020,(1),(0)))
3076 {
3077 sfprintf(sfstderr, "STAT: complete=%d n=%d k=%d l=%d r=%d y=%d:%d e=%d:%d\n", a->re.bm.complete, n, k, a->re.bm.left, a->re.bm.right, y->type, y->next ? y->next->type : 0, e->type, e->next ? e->next->type : 0);
3078 for (m = 0; m < n; m++)
3079 for (i = 1; i <= UCHAR_MAX; i++)
3080 if (a->re.bm.mask[m][i])
3081 sfprintf(sfstderr, "MASK: [%d]['%c'] = %032..2u\n", m, i, a->re.bm.mask[m][i]);
3082 for (i = ' '; i <= UCHAR_MAX; i++)
3083 if (a->re.bm.skip[i] >= HIT)
3084 sfprintf(sfstderr, "SKIP: ['%c'] = *\n", i);
3085 else if (a->re.bm.skip[i] > 0 && a->re.bm.skip[i] < n)
3086 sfprintf(sfstderr, "SKIP: ['%c'] = %3d\n", i, a->re.bm.skip[i]);
3087 for (j = 31; j >= 0; j--)
3088 {
3089 sfprintf(sfstderr, " ");
3090 next:
3091 for (m = 0; m < n; m++)
3092 {
3093 for (i = 0040; i < 0177; i++)
3094 if (a->re.bm.mask[m][i] & (1 << j))
3095 {
3096 sfprintf(sfstderr, " %c", i);
3097 break;
3098 }
3099 if (i >= 0177)
3100 {
3101 if (j > 0)
3102 {
3103 j--;
3104 goto next;
3105 }
3106 sfprintf(sfstderr, " ?");
3107 }
3108 }
3109 sfprintf(sfstderr, "\n");
3110 }
3111 sfprintf(sfstderr, "FAIL: ");
3112 for (m = 1; m <= n; m++)
3113 sfprintf(sfstderr, "%3d", a->re.bm.fail[m]);
3114 sfprintf(sfstderr, "\n");
3115 }
3116 #endif
3117 alloc(env->disc, q, 0);
3118 a->next = e;
3119 p->env->rex = a;
3120 return 0;
3121 }
3122 switch (e->type)
3123 {
3124 case REX_BEG:
3125 if (env->flags & REG_NEWLINE)
3126 return 0;
3127 break;
3128 case REX_GROUP:
3129 if (env->stats.b)
3130 return 0;
3131 e = e->re.group.expr.rex;
3132 if (e->type != REX_DOT)
3133 return 0;
3134 /*FALLTHROUGH*/
3135 case REX_DOT:
3136 if (e->lo == 0 && e->hi == RE_DUP_INF)
3137 break;
3138 return 0;
3139 case REX_NULL:
3140 if (env->flags & REG_NULL)
3141 break;
3142 env->error = REG_ENULL;
3143 return 1;
3144 case REX_STRING:
3145 if ((env->flags & (REG_LEFT|REG_LITERAL|REG_RIGHT)) || e->map)
3146 return 0;
3147 s = e->re.string.base;
3148 n = e->re.string.size;
3149 if (!(a = node(env, REX_KMP, 0, 0, n * (sizeof(int*) + 1))))
3150 return 1;
3151 a->flags = e->flags;
3152 a->map = e->map;
3153 f = a->re.string.fail;
3154 memcpy((char*)(a->re.string.base = (unsigned char*)&f[n]), (char*)s, n);
3155 s = a->re.string.base;
3156 a->re.string.size = n;
3157 f[0] = m = -1;
3158 for (k = 1; k < n; k++)
3159 {
3160 while (m >= 0 && s[m+1] != s[k])
3161 m = f[m];
3162 if (s[m+1] == s[k])
3163 m++;
3164 f[k] = m;
3165 }
3166 a->next = e->next;
3167 p->env->rex = a;
3168 e->next = 0;
3169 drop(env->disc, e);
3170 break;
3171 default:
3172 return 0;
3173 }
3174 }
3175 p->env->once = 1;
3176 return 0;
3177 }
3178
3179 int
regcomp(regex_t * p,const char * pattern,regflags_t flags)3180 regcomp(regex_t* p, const char* pattern, regflags_t flags)
3181 {
3182 Rex_t* e;
3183 Rex_t* f;
3184 regdisc_t* disc;
3185 unsigned char* fold;
3186 int i;
3187 Cenv_t env;
3188
3189 if (!p)
3190 return REG_BADPAT;
3191 if (flags & REG_DISCIPLINE)
3192 {
3193 flags &= ~REG_DISCIPLINE;
3194 disc = p->re_disc;
3195 }
3196 else
3197 disc = &state.disc;
3198 if (!disc->re_errorlevel)
3199 disc->re_errorlevel = 2;
3200 p->env = 0;
3201 if (!pattern)
3202 return fatal(disc, REG_BADPAT, pattern);
3203 if (!state.initialized)
3204 {
3205 state.initialized = 1;
3206 for (i = 0; i < elementsof(state.escape); i++)
3207 state.magic[state.escape[i].key] = state.escape[i].val;
3208 }
3209 if (!(fold = (unsigned char*)LCINFO(AST_LC_CTYPE)->data))
3210 {
3211 if (!(fold = newof(0, unsigned char, UCHAR_MAX, 1)))
3212 return fatal(disc, REG_ESPACE, pattern);
3213 for (i = 0; i <= UCHAR_MAX; i++)
3214 fold[i] = toupper(i);
3215 LCINFO(AST_LC_CTYPE)->data = (void*)fold;
3216 }
3217 again:
3218 if (!(p->env = (Env_t*)alloc(disc, 0, sizeof(Env_t))))
3219 return fatal(disc, REG_ESPACE, pattern);
3220 memset(p->env, 0, sizeof(*p->env));
3221 memset(&env, 0, sizeof(env));
3222 env.regex = p;
3223 env.flags = flags;
3224 env.disc = p->env->disc = disc;
3225 if (env.flags & REG_AUGMENTED)
3226 env.flags |= REG_EXTENDED;
3227 env.mappeddot = '.';
3228 env.mappednewline = '\n';
3229 env.mappedslash = '/';
3230 if (disc->re_version >= REG_VERSION_MAP && disc->re_map)
3231 {
3232 env.map = disc->re_map;
3233 env.MAP = p->env->fold;
3234 for (i = 0; i <= UCHAR_MAX; i++)
3235 {
3236 env.MAP[i] = fold[env.map[i]];
3237 if (env.map[i] == '.')
3238 env.mappeddot = i;
3239 if (env.map[i] == '\n')
3240 env.mappednewline = i;
3241 if (env.map[i] == '/')
3242 env.mappedslash = i;
3243 }
3244 }
3245 else
3246 env.MAP = fold;
3247 env.type = (env.flags & REG_AUGMENTED) ? ARE : (env.flags & REG_EXTENDED) ? ERE : BRE;
3248 env.explicit = -1;
3249 if (env.flags & REG_SHELL)
3250 {
3251 if (env.flags & REG_SHELL_PATH)
3252 env.explicit = env.mappedslash;
3253 env.flags |= REG_LENIENT|REG_NULL;
3254 env.type = env.type == BRE ? SRE : KRE;
3255 }
3256 if ((env.flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE)
3257 env.explicit = env.mappednewline;
3258 p->env->leading = (env.flags & REG_SHELL_DOT) ? env.mappeddot : -1;
3259 env.posixkludge = !(env.flags & (REG_EXTENDED|REG_SHELL));
3260 env.regexp = !!(env.flags & REG_REGEXP);
3261 env.token.lex = 0;
3262 env.token.push = 0;
3263 if (env.flags & REG_DELIMITED)
3264 {
3265 switch (env.delimiter = *pattern++)
3266 {
3267 case 0:
3268 case '\\':
3269 case '\n':
3270 case '\r':
3271 env.error = REG_EDELIM;
3272 goto bad;
3273 }
3274 env.terminator = '\n';
3275 }
3276 env.literal = env.pattern = env.cursor = (unsigned char*)pattern;
3277 if (!(p->env->rex = alt(&env, 1, 0)))
3278 goto bad;
3279 if (env.parnest)
3280 {
3281 env.error = REG_EPAREN;
3282 goto bad;
3283 }
3284 p->env->stats.re_flags = env.flags & (REG_EXTENDED|REG_AUGMENTED|REG_SHELL);
3285 if (env.flags & REG_LEFT)
3286 {
3287 if (p->env->rex->type != REX_BEG)
3288 {
3289 if (p->env->rex->type == REX_ALT)
3290 env.flags &= ~REG_FIRST;
3291 if (!(e = node(&env, REX_BEG, 0, 0, 0)))
3292 {
3293 regfree(p);
3294 return fatal(disc, REG_ESPACE, pattern);
3295 }
3296 e->next = p->env->rex;
3297 p->env->rex = e;
3298 p->env->once = 1;
3299 }
3300 p->env->stats.re_flags |= REG_LEFT;
3301 }
3302 for (e = p->env->rex; e->next; e = e->next);
3303 p->env->done.type = REX_DONE;
3304 p->env->done.flags = e->flags;
3305 if (env.flags & REG_RIGHT)
3306 {
3307 if (e->type != REX_END)
3308 {
3309 if (p->env->rex->type == REX_ALT)
3310 env.flags &= ~REG_FIRST;
3311 if (!(f = node(&env, REX_END, 0, 0, 0)))
3312 {
3313 regfree(p);
3314 return fatal(disc, REG_ESPACE, pattern);
3315 }
3316 f->flags = e->flags;
3317 f->map = e->map;
3318 e->next = f;
3319 }
3320 p->env->stats.re_flags |= REG_RIGHT;
3321 }
3322 if (stats(&env, p->env->rex))
3323 {
3324 if (!env.error)
3325 env.error = REG_ECOUNT;
3326 goto bad;
3327 }
3328 if (env.stats.b)
3329 p->env->hard = p->env->separate = 1;
3330 else if (!(env.flags & REG_FIRST) && (env.stats.a || env.stats.c > 1 && env.stats.c != env.stats.s || env.stats.t && (env.stats.t > 1 || env.stats.a || env.stats.c)))
3331 p->env->hard = 1;
3332 if (p->env->hard || env.stats.c || env.stats.i)
3333 p->env->stats.re_min = p->env->stats.re_max = -1;
3334 else
3335 {
3336 if (!(p->env->stats.re_min = env.stats.m))
3337 p->env->stats.re_min = -1;
3338 if (!(p->env->stats.re_max = env.stats.n))
3339 p->env->stats.re_max = -1;
3340 }
3341 if (special(&env, p))
3342 goto bad;
3343 serialize(&env, p->env->rex, 1);
3344 p->re_nsub = env.stats.p;
3345 if (env.type == KRE)
3346 p->re_nsub /= 2;
3347 if (env.flags & REG_DELIMITED)
3348 {
3349 p->re_npat = env.cursor - env.pattern + 1;
3350 if (*env.cursor == env.delimiter)
3351 p->re_npat++;
3352 else if (env.flags & REG_MUSTDELIM)
3353 {
3354 env.error = REG_EDELIM;
3355 goto bad;
3356 }
3357 else
3358 env.flags &= ~REG_DELIMITED;
3359 }
3360 p->env->explicit = env.explicit;
3361 p->env->flags = env.flags & REG_COMP;
3362 p->env->min = env.stats.m;
3363 p->env->nsub = env.stats.p + env.stats.u;
3364 p->env->refs = 1;
3365 return 0;
3366 bad:
3367 regfree(p);
3368 if (!env.error)
3369 env.error = REG_ESPACE;
3370 if (env.type >= SRE && env.error != REG_ESPACE && !(flags & REG_LITERAL))
3371 {
3372 flags |= REG_LITERAL;
3373 pattern = (const char*)env.literal;
3374 goto again;
3375 }
3376 return fatal(disc, env.error, pattern);
3377 }
3378
3379 /*
3380 * regcomp() on sized pattern
3381 * the lazy way around adding and checking env.end
3382 */
3383
3384 int
regncomp(regex_t * p,const char * pattern,size_t size,regflags_t flags)3385 regncomp(regex_t* p, const char* pattern, size_t size, regflags_t flags)
3386 {
3387 char* s;
3388 int r;
3389
3390 if (!(s = malloc(size + 1)))
3391 return fatal((flags & REG_DISCIPLINE) ? p->re_disc : &state.disc, REG_ESPACE, pattern);
3392 memcpy(s, pattern, size);
3393 s[size] = 0;
3394 r = regcomp(p, s, flags);
3395 free(s);
3396 return r;
3397 }
3398
3399 /*
3400 * combine two compiled regular expressions if possible,
3401 * replacing first with the combination and freeing second.
3402 * return 0 on success.
3403 * the only combinations handled are building a Trie
3404 * from String|Kmp|Trie and String|Kmp
3405 */
3406
3407 int
regcomb(regex_t * p,regex_t * q)3408 regcomb(regex_t* p, regex_t* q)
3409 {
3410 Rex_t* e = p->env->rex;
3411 Rex_t* f = q->env->rex;
3412 Rex_t* g;
3413 Rex_t* h;
3414 Cenv_t env;
3415
3416 if (!e || !f)
3417 return fatal(p->env->disc, REG_BADPAT, NiL);
3418 if (p->env->separate || q->env->separate)
3419 return REG_ESUBREG;
3420 memset(&env, 0, sizeof(env));
3421 env.disc = p->env->disc;
3422 if (e->type == REX_BM)
3423 {
3424 p->env->rex = e->next;
3425 e->next = 0;
3426 drop(env.disc, e);
3427 e = p->env->rex;
3428 }
3429 if (f->type == REX_BM)
3430 {
3431 q->env->rex = f->next;
3432 f->next = 0;
3433 drop(env.disc, f);
3434 f = q->env->rex;
3435 }
3436 if (e->type == REX_BEG && f->type == REX_BEG)
3437 {
3438 p->env->flags |= REG_LEFT;
3439 p->env->rex = e->next;
3440 e->next = 0;
3441 drop(env.disc, e);
3442 e = p->env->rex;
3443 q->env->rex = f->next;
3444 f->next = 0;
3445 drop(env.disc, f);
3446 f = q->env->rex;
3447 }
3448 for (g = e; g->next; g = g->next);
3449 for (h = f; h->next; h = h->next);
3450 if (g->next && g->next->type == REX_END && h->next && h->next->type == REX_END)
3451 {
3452 p->env->flags |= REG_RIGHT;
3453 drop(env.disc, g->next);
3454 g->next = 0;
3455 drop(env.disc, h->next);
3456 h->next = 0;
3457 }
3458 if (!(g = trie(&env, f, e)))
3459 return fatal(p->env->disc, REG_BADPAT, NiL);
3460 p->env->rex = g;
3461 if (!q->env->once)
3462 p->env->once = 0;
3463 q->env->rex = 0;
3464 if (p->env->flags & REG_LEFT)
3465 {
3466 if (!(e = node(&env, REX_BEG, 0, 0, 0)))
3467 {
3468 regfree(p);
3469 return fatal(p->env->disc, REG_ESPACE, NiL);
3470 }
3471 e->next = p->env->rex;
3472 p->env->rex = e;
3473 p->env->once = 1;
3474 }
3475 if (p->env->flags & REG_RIGHT)
3476 {
3477 for (f = p->env->rex; f->next; f = f->next);
3478 if (f->type != REX_END)
3479 {
3480 if (!(e = node(&env, REX_END, 0, 0, 0)))
3481 {
3482 regfree(p);
3483 return fatal(p->env->disc, REG_ESPACE, NiL);
3484 }
3485 f->next = e;
3486 }
3487 }
3488 env.explicit = p->env->explicit;
3489 env.flags = p->env->flags;
3490 env.disc = p->env->disc;
3491 if (stats(&env, p->env->rex))
3492 {
3493 regfree(p);
3494 return fatal(p->env->disc, env.error ? env.error : REG_ECOUNT, NiL);
3495 }
3496 if (special(&env, p))
3497 {
3498 regfree(p);
3499 return fatal(p->env->disc, env.error ? env.error : REG_ESPACE, NiL);
3500 }
3501 p->env->min = g->re.trie.min;
3502 return 0;
3503 }
3504
3505 /*
3506 * copy a reference of p into q
3507 * p and q may then have separate regsubcomp() instantiations
3508 */
3509
3510 int
regdup(regex_t * p,regex_t * q)3511 regdup(regex_t* p, regex_t* q)
3512 {
3513 if (!p || !q)
3514 return REG_BADPAT;
3515 *q = *p;
3516 p->env->refs++;
3517 q->re_sub = 0;
3518 return 0;
3519 }
3520