1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 1985-2010 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Common Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.opensource.org/licenses/cpl1.0.txt *
11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
19 * Phong Vo <kpv@research.att.com> *
20 * *
21 ***********************************************************************/
22 #pragma prototyped
23
24 /*
25 * posix regex executor
26 * single sized-record interface
27 */
28
29 #include "reglib.h"
30
31 #if _AST_REGEX_DEBUG
32
33 #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n))
34 #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0)
35 #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_exec_debug")) debug |= strtoul(t, NiL, 0); } } while (0)
36
37 static unsigned long debug;
38 static unsigned long debug_flag;
39
40 static const char* rexnames[] =
41 {
42 "REX_NULL",
43 "REX_ALT",
44 "REX_ALT_CATCH",
45 "REX_BACK",
46 "REX_BEG",
47 "REX_BEG_STR",
48 "REX_BM",
49 "REX_CAT",
50 "REX_CLASS",
51 "REX_COLL_CLASS",
52 "REX_CONJ",
53 "REX_CONJ_LEFT",
54 "REX_CONJ_RIGHT",
55 "REX_DONE",
56 "REX_DOT",
57 "REX_END",
58 "REX_END_STR",
59 "REX_EXEC",
60 "REX_FIN_STR",
61 "REX_GROUP",
62 "REX_GROUP_CATCH",
63 "REX_GROUP_AHEAD",
64 "REX_GROUP_AHEAD_CATCH",
65 "REX_GROUP_AHEAD_NOT",
66 "REX_GROUP_BEHIND",
67 "REX_GROUP_BEHIND_CATCH",
68 "REX_GROUP_BEHIND_NOT",
69 "REX_GROUP_BEHIND_NOT_CATCH",
70 "REX_GROUP_COND",
71 "REX_GROUP_COND_CATCH",
72 "REX_GROUP_CUT",
73 "REX_GROUP_CUT_CATCH",
74 "REX_KMP",
75 "REX_NEG",
76 "REX_NEG_CATCH",
77 "REX_NEST",
78 "REX_ONECHAR",
79 "REX_REP",
80 "REX_REP_CATCH",
81 "REX_STRING",
82 "REX_TRIE",
83 "REX_WBEG",
84 "REX_WEND",
85 "REX_WORD",
86 "REX_WORD_NOT"
87 };
88
rexname(Rex_t * rex)89 static const char* rexname(Rex_t* rex)
90 {
91 if (!rex)
92 return "NIL";
93 if (rex->type >= elementsof(rexnames))
94 return "ERROR";
95 return rexnames[rex->type];
96 }
97
98 #else
99
100 #define DEBUG_INIT()
101 #define DEBUG_TEST(f,y,n) (n)
102 #define DEBUG_CODE(f,y,n) do {n} while(0)
103
104 #endif
105
106 #define BEG_ALT 1 /* beginning of an alt */
107 #define BEG_ONE 2 /* beginning of one iteration of a rep */
108 #define BEG_REP 3 /* beginning of a repetition */
109 #define BEG_SUB 4 /* beginning of a subexpression */
110 #define END_ANY 5 /* end of any of above */
111
112 /*
113 * returns from parse()
114 */
115
116 #define NONE 0 /* no parse found */
117 #define GOOD 1 /* some parse was found */
118 #define CUT 2 /* no match and no backtrack */
119 #define BEST 3 /* an unbeatable parse was found */
120 #define BAD 4 /* error ocurred */
121
122 /*
123 * REG_SHELL_DOT test
124 */
125
126 #define LEADING(e,r,s) (*(s)==(e)->leading&&((s)==(e)->beg||*((s)-1)==(r)->explicit))
127
128 /*
129 * Pos_t is for comparing parses. An entry is made in the
130 * array at the beginning and at the end of each Group_t,
131 * each iteration in a Group_t, and each Binary_t.
132 */
133
134 typedef struct
135 {
136 unsigned char* p; /* where in string */
137 size_t length; /* length in string */
138 short serial; /* preorder subpattern number */
139 short be; /* which end of pair */
140 } Pos_t;
141
142 /* ===== begin library support ===== */
143
144 #define vector(t,v,i) (((i)<(v)->max)?(t*)((v)->vec+(i)*(v)->siz):(t*)vecseek(&(v),i))
145
146 static Vector_t*
vecopen(int inc,int siz)147 vecopen(int inc, int siz)
148 {
149 Vector_t* v;
150 Stk_t* sp;
151
152 if (inc <= 0)
153 inc = 16;
154 if (!(sp = stkopen(STK_SMALL|STK_NULL)))
155 return 0;
156 if (!(v = (Vector_t*)stkseek(sp, sizeof(Vector_t) + inc * siz)))
157 {
158 stkclose(sp);
159 return 0;
160 }
161 v->stk = sp;
162 v->vec = (char*)v + sizeof(Vector_t);
163 v->max = v->inc = inc;
164 v->siz = siz;
165 v->cur = 0;
166 return v;
167 }
168
169 static void*
vecseek(Vector_t ** p,int index)170 vecseek(Vector_t** p, int index)
171 {
172 Vector_t* v = *p;
173
174 if (index >= v->max)
175 {
176 while ((v->max += v->inc) <= index);
177 if (!(v = (Vector_t*)stkseek(v->stk, sizeof(Vector_t) + v->max * v->siz)))
178 return 0;
179 *p = v;
180 v->vec = (char*)v + sizeof(Vector_t);
181 }
182 return v->vec + index * v->siz;
183 }
184
185 static void
vecclose(Vector_t * v)186 vecclose(Vector_t* v)
187 {
188 if (v)
189 stkclose(v->stk);
190 }
191
192 typedef struct
193 {
194 Stk_pos_t pos;
195 char data[1];
196 } Stk_frame_t;
197
198 #define stknew(s,p) ((p)->offset=stktell(s),(p)->base=stkfreeze(s,0))
199 #define stkold(s,p) stkset(s,(p)->base,(p)->offset)
200
201 #define stkframe(s) (*((Stk_frame_t**)stktop(s)-1))
202 #define stkdata(s,t) ((t*)stkframe(s)->data)
203 #define stkpop(s) stkold(s,&(stkframe(s)->pos))
204
205 static void*
stkpush(Stk_t * sp,size_t size)206 stkpush(Stk_t* sp, size_t size)
207 {
208 Stk_frame_t* f;
209 Stk_pos_t p;
210
211 stknew(sp, &p);
212 size = sizeof(Stk_frame_t) + sizeof(size_t) + size - 1;
213 if (!(f = (Stk_frame_t*)stkalloc(sp, sizeof(Stk_frame_t) + sizeof(Stk_frame_t*) + size - 1)))
214 return 0;
215 f->pos = p;
216 stkframe(sp) = f;
217 return f->data;
218 }
219
220 /* ===== end library support ===== */
221
222 /*
223 * Match_frame_t is for saving and restoring match records
224 * around alternate attempts, so that fossils will not be
225 * left in the match array. These are the only entries in
226 * the match array that are not otherwise guaranteed to
227 * have current data in them when they get used.
228 */
229
230 typedef struct
231 {
232 size_t size;
233 regmatch_t* match;
234 regmatch_t save[1];
235 } Match_frame_t;
236
237 #define matchframe stkdata(stkstd,Match_frame_t)
238 #define matchpush(e,x) ((x)->re.group.number?_matchpush(e,x):0)
239 #define matchcopy(e,x) ((x)->re.group.number?memcpy(matchframe->match,matchframe->save,matchframe->size):(void*)0)
240 #define matchpop(e,x) ((x)->re.group.number?memcpy(matchframe->match,matchframe->save,matchframe->size),stkpop(stkstd):(void*)0)
241
242 #define pospop(e) (--(e)->pos->cur)
243
244 /*
245 * allocate a frame and push a match onto the stack
246 */
247
248 static int
_matchpush(Env_t * env,Rex_t * rex)249 _matchpush(Env_t* env, Rex_t* rex)
250 {
251 Match_frame_t* f;
252 regmatch_t* m;
253 regmatch_t* e;
254 regmatch_t* s;
255 int num;
256
257 if (rex->re.group.number <= 0 || (num = rex->re.group.last - rex->re.group.number + 1) <= 0)
258 num = 0;
259 if (!(f = (Match_frame_t*)stkpush(stkstd, sizeof(Match_frame_t) + (num - 1) * sizeof(regmatch_t))))
260 {
261 env->error = REG_ESPACE;
262 return 1;
263 }
264 f->size = num * sizeof(regmatch_t);
265 f->match = m = env->match + rex->re.group.number;
266 e = m + num;
267 s = f->save;
268 while (m < e)
269 {
270 *s++ = *m;
271 *m++ = state.nomatch;
272 }
273 return 0;
274 }
275
276 /*
277 * allocate a frame and push a pos onto the stack
278 */
279
280 static int
pospush(Env_t * env,Rex_t * rex,unsigned char * p,int be)281 pospush(Env_t* env, Rex_t* rex, unsigned char* p, int be)
282 {
283 Pos_t* pos;
284
285 if (!(pos = vector(Pos_t, env->pos, env->pos->cur)))
286 {
287 env->error = REG_ESPACE;
288 return 1;
289 }
290 pos->serial = rex->serial;
291 pos->p = p;
292 pos->be = be;
293 env->pos->cur++;
294 return 0;
295 }
296
297 /*
298 * two matches are known to have the same length
299 * os is start of old pos array, ns is start of new,
300 * oend and nend are end+1 pointers to ends of arrays.
301 * oe and ne are ends (not end+1) of subarrays.
302 * returns 1 if new is better, -1 if old, else 0.
303 */
304
305 static int
better(Env_t * env,Pos_t * os,Pos_t * ns,Pos_t * oend,Pos_t * nend,int level)306 better(Env_t* env, Pos_t* os, Pos_t* ns, Pos_t* oend, Pos_t* nend, int level)
307 {
308 Pos_t* oe;
309 Pos_t* ne;
310 int k;
311 int n;
312
313 if (env->error)
314 return -1;
315 for (;;)
316 {
317 DEBUG_CODE(0x0080,{sfprintf(sfstdout, " %-*.*sold ", (level + 3) * 4, (level + 3) * 4, "");for (oe = os; oe < oend; oe++)sfprintf(sfstdout, "<%d,%d,%d>", oe->p - env->beg, oe->serial, oe->be);sfprintf(sfstdout, "\n %-*.*snew ", (level + 3) * 4, (level + 3) * 4, "");for (oe = ns; oe < nend; oe++)sfprintf(sfstdout, "<%d,%d,%d>", oe->p - env->beg, oe->serial, oe->be);sfprintf(sfstdout, "\n");},{0;});
318 if (ns >= nend)
319 return DEBUG_TEST(0x8000,(os < oend),(0));
320 if (os >= oend)
321 return DEBUG_TEST(0x8000,(-1),(1));
322 n = os->serial;
323 if (ns->serial > n)
324 return -1;
325 if (n > ns->serial)
326 {
327 env->error = REG_PANIC;
328 return -1;
329 }
330 if (ns->p > os->p)
331 return 1;
332 if (os->p > ns->p)
333 return -1;
334 oe = os;
335 k = 0;
336 for (;;)
337 if ((++oe)->serial == n)
338 {
339 if (oe->be != END_ANY)
340 k++;
341 else if (k-- <= 0)
342 break;
343 }
344 ne = ns;
345 k = 0;
346 for (;;)
347 if ((++ne)->serial == n)
348 {
349 if (ne->be != END_ANY)
350 k++;
351 else if (k-- <= 0)
352 break;
353 }
354 if (ne->p > oe->p)
355 return 1;
356 if (oe->p > ne->p)
357 return -1;
358 if (k = better(env, os + 1, ns + 1, oe, ne, level + 1))
359 return k;
360 os = oe + 1;
361 ns = ne + 1;
362 }
363 }
364
365 #if _AST_REGEX_DEBUG
366
367 static void
showmatch(regmatch_t * p)368 showmatch(regmatch_t* p)
369 {
370 sfputc(sfstdout, '(');
371 if (p->rm_so < 0)
372 sfputc(sfstdout, '?');
373 else
374 sfprintf(sfstdout, "%d", p->rm_so);
375 sfputc(sfstdout, ',');
376 if (p->rm_eo < 0)
377 sfputc(sfstdout, '?');
378 else
379 sfprintf(sfstdout, "%d", p->rm_eo);
380 sfputc(sfstdout, ')');
381 }
382
383 static int
_better(Env_t * env,Pos_t * os,Pos_t * ns,Pos_t * oend,Pos_t * nend,int level)384 _better(Env_t* env, Pos_t* os, Pos_t* ns, Pos_t* oend, Pos_t* nend, int level)
385 {
386 int i;
387
388 DEBUG_CODE(0x0040,{sfprintf(sfstdout, "AHA better old ");for (i = 0; i <= env->nsub; i++)showmatch(&env->best[i]);sfprintf(sfstdout, "\n new ");for (i = 0; i <= env->nsub; i++)showmatch(&env->match[i]);sfprintf(sfstdout, "\n");},{0;});
389 i = better(env, os, ns, oend, nend, 0);
390 DEBUG_TEST(0x0040,(sfprintf(sfstdout, " %s\n", i <= 0 ? "OLD" : "NEW")),(0));
391 return i;
392 }
393
394 #define better _better
395
396 #endif
397
398 #define follow(e,r,c,s) ((r)->next?parse(e,(r)->next,c,s):(c)?parse(e,c,0,s):BEST)
399
400 static int parse(Env_t*, Rex_t*, Rex_t*, unsigned char*);
401
402 static int
parserep(Env_t * env,Rex_t * rex,Rex_t * cont,unsigned char * s,int n)403 parserep(Env_t* env, Rex_t* rex, Rex_t* cont, unsigned char* s, int n)
404 {
405 int i;
406 int r = NONE;
407 Rex_t catcher;
408
409 DEBUG_TEST(0x0010,(sfprintf(sfstdout, "AHA#%04d 0x%04x parserep %s %d %d %d %d `%-.*s'\n", __LINE__, debug_flag, rexname(rex->re.group.expr.rex), rex->re.group.number, rex->lo, n, rex->hi, env->end - s, s)),(0));
410 if ((rex->flags & REG_MINIMAL) && n >= rex->lo && n < rex->hi)
411 {
412 if (env->stack && pospush(env, rex, s, END_ANY))
413 return BAD;
414 i = follow(env, rex, cont, s);
415 if (env->stack)
416 pospop(env);
417 switch (i)
418 {
419 case BAD:
420 return BAD;
421 case CUT:
422 return CUT;
423 case BEST:
424 case GOOD:
425 return BEST;
426 }
427 }
428 if (n < rex->hi)
429 {
430 catcher.type = REX_REP_CATCH;
431 catcher.serial = rex->serial;
432 catcher.re.rep_catch.ref = rex;
433 catcher.re.rep_catch.cont = cont;
434 catcher.re.rep_catch.beg = s;
435 catcher.re.rep_catch.n = n + 1;
436 catcher.next = rex->next;
437 if (n == 0)
438 rex->re.rep_catch.beg = s;
439 if (env->stack)
440 {
441 if (matchpush(env, rex))
442 return BAD;
443 if (pospush(env, rex, s, BEG_ONE))
444 return BAD;
445 DEBUG_TEST(0x0004,(sfprintf(sfstdout,"AHA#%04d 0x%04x PUSH %d (%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)(%d,%d)\n", __LINE__, debug_flag, rex->re.group.number, env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo, env->match[2].rm_so, env->match[2].rm_eo)),(0));
446 }
447 r = parse(env, rex->re.group.expr.rex, &catcher, s);
448 DEBUG_TEST(0x0010,(sfprintf(sfstdout, "AHA#%04d 0x%04x parserep parse %d %d `%-.*s'\n", __LINE__, debug_flag, rex->re.group.number, r, env->end - s, s)),(0));
449 if (env->stack)
450 {
451 pospop(env);
452 matchpop(env, rex);
453 DEBUG_TEST(0x0004,(sfprintf(sfstdout,"AHA#%04d 0x%04x POP %d %d (%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)(%d,%d)\n", __LINE__, debug_flag, rex->re.group.number, r, env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo, env->match[2].rm_so, env->match[2].rm_eo)),(0));
454 }
455 switch (r)
456 {
457 case BAD:
458 return BAD;
459 case BEST:
460 return BEST;
461 case CUT:
462 r = NONE;
463 break;
464 case GOOD:
465 if (rex->flags & REG_MINIMAL)
466 return BEST;
467 r = GOOD;
468 break;
469 }
470 }
471 if (n < rex->lo)
472 return r;
473 if (!(rex->flags & REG_MINIMAL) || n >= rex->hi)
474 {
475 if (env->stack && pospush(env, rex, s, END_ANY))
476 return BAD;
477 i = follow(env, rex, cont, s);
478 if (env->stack)
479 pospop(env);
480 switch (i)
481 {
482 case BAD:
483 r = BAD;
484 break;
485 case CUT:
486 r = CUT;
487 break;
488 case BEST:
489 r = BEST;
490 break;
491 case GOOD:
492 r = (rex->flags & REG_MINIMAL) ? BEST : GOOD;
493 break;
494 }
495 }
496 return r;
497 }
498
499 static int
parsetrie(Env_t * env,Trie_node_t * x,Rex_t * rex,Rex_t * cont,unsigned char * s)500 parsetrie(Env_t* env, Trie_node_t* x, Rex_t* rex, Rex_t* cont, unsigned char* s)
501 {
502 unsigned char* p;
503 int r;
504
505 if (p = rex->map)
506 {
507 for (;;)
508 {
509 if (s >= env->end)
510 return NONE;
511 while (x->c != p[*s])
512 if (!(x = x->sib))
513 return NONE;
514 if (x->end)
515 break;
516 x = x->son;
517 s++;
518 }
519 }
520 else
521 {
522 for (;;)
523 {
524 if (s >= env->end)
525 return NONE;
526 while (x->c != *s)
527 if (!(x = x->sib))
528 return NONE;
529 if (x->end)
530 break;
531 x = x->son;
532 s++;
533 }
534 }
535 s++;
536 if (rex->flags & REG_MINIMAL)
537 switch (follow(env, rex, cont, s))
538 {
539 case BAD:
540 return BAD;
541 case CUT:
542 return CUT;
543 case BEST:
544 case GOOD:
545 return BEST;
546 }
547 if (x->son)
548 switch (parsetrie(env, x->son, rex, cont, s))
549 {
550 case BAD:
551 return BAD;
552 case CUT:
553 return CUT;
554 case BEST:
555 return BEST;
556 case GOOD:
557 if (rex->flags & REG_MINIMAL)
558 return BEST;
559 r = GOOD;
560 break;
561 default:
562 r = NONE;
563 break;
564 }
565 else
566 r = NONE;
567 if (!(rex->flags & REG_MINIMAL))
568 switch (follow(env, rex, cont, s))
569 {
570 case BAD:
571 return BAD;
572 case CUT:
573 return CUT;
574 case BEST:
575 return BEST;
576 case GOOD:
577 return GOOD;
578 }
579 return r;
580 }
581
582 static int
collelt(register Celt_t * ce,char * key,int c,int x)583 collelt(register Celt_t* ce, char* key, int c, int x)
584 {
585 Ckey_t elt;
586
587 mbxfrm(elt, key, COLL_KEY_MAX);
588 for (;; ce++)
589 {
590 switch (ce->typ)
591 {
592 case COLL_call:
593 if (!x && (*ce->fun)(c))
594 return 1;
595 continue;
596 case COLL_char:
597 if (!strcmp((char*)ce->beg, (char*)elt))
598 return 1;
599 continue;
600 case COLL_range:
601 if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max)
602 return 1;
603 continue;
604 case COLL_range_lc:
605 if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max && (iswlower(c) || !iswupper(c)))
606 return 1;
607 continue;
608 case COLL_range_uc:
609 if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max && (iswupper(c) || !iswlower(c)))
610 return 1;
611 continue;
612 }
613 break;
614 }
615 return 0;
616 }
617
618 static int
collic(register Celt_t * ce,char * key,register char * nxt,int c,int x)619 collic(register Celt_t* ce, char* key, register char* nxt, int c, int x)
620 {
621 if (!x)
622 {
623 if (collelt(ce, key, c, x))
624 return 1;
625 if (iswlower(c))
626 c = towupper(c);
627 else if (iswupper(c))
628 c = towlower(c);
629 else
630 return 0;
631 x = mbconv(key, c);
632 key[x] = 0;
633 return collelt(ce, key, c, 0);
634 }
635 while (*nxt)
636 {
637 if (collic(ce, key, nxt + 1, c, x))
638 return 1;
639 if (islower(*nxt))
640 *nxt = toupper(*nxt);
641 else if (isupper(*nxt))
642 *nxt = tolower(*nxt);
643 else
644 return 0;
645 nxt++;
646 }
647 return collelt(ce, key, c, x);
648 }
649
650 static int
collmatch(Rex_t * rex,unsigned char * s,unsigned char * e,unsigned char ** p)651 collmatch(Rex_t* rex, unsigned char* s, unsigned char* e, unsigned char** p)
652 {
653 unsigned char* t;
654 wchar_t c;
655 int w;
656 int r;
657 int x;
658 int ic;
659 Ckey_t key;
660 Ckey_t elt;
661
662 ic = (rex->flags & REG_ICASE);
663 if ((w = MBSIZE(s)) > 1)
664 {
665 memcpy((char*)key, (char*)s, w);
666 key[w] = 0;
667 t = s;
668 c = mbchar(t);
669 #if !_lib_wctype
670 c &= 0xff;
671 #endif
672 x = 0;
673 }
674 else
675 {
676 c = s[0];
677 if (ic && isupper(c))
678 c = tolower(c);
679 key[0] = c;
680 key[1] = 0;
681 if (isalpha(c))
682 {
683 x = e - s;
684 if (x > COLL_KEY_MAX)
685 x = COLL_KEY_MAX;
686 while (w < x)
687 {
688 c = s[w];
689 if (!isalpha(c))
690 break;
691 r = mbxfrm(elt, key, COLL_KEY_MAX);
692 if (ic && isupper(c))
693 c = tolower(c);
694 key[w] = c;
695 key[w + 1] = 0;
696 if (mbxfrm(elt, key, COLL_KEY_MAX) != r)
697 break;
698 w++;
699 }
700 }
701 key[w] = 0;
702 c = key[0];
703 x = w - 1;
704 }
705 r = 1;
706 for (;;)
707 {
708 if (ic ? collic(rex->re.collate.elements, (char*)key, (char*)key, c, x) : collelt(rex->re.collate.elements, (char*)key, c, x))
709 break;
710 if (!x)
711 {
712 r = 0;
713 break;
714 }
715 w = x--;
716 key[w] = 0;
717 }
718 *p = s + w;
719 return rex->re.collate.invert ? !r : r;
720 }
721
722 static unsigned char*
nestmatch(register unsigned char * s,register unsigned char * e,const unsigned short * type,register int co)723 nestmatch(register unsigned char* s, register unsigned char* e, const unsigned short* type, register int co)
724 {
725 register int c;
726 register int cc;
727 unsigned int n;
728 int oc;
729
730 if (type[co] & (REX_NEST_literal|REX_NEST_quote))
731 {
732 n = (type[co] & REX_NEST_literal) ? REX_NEST_terminator : (REX_NEST_escape|REX_NEST_terminator);
733 while (s < e)
734 {
735 c = *s++;
736 if (c == co)
737 return s;
738 else if (type[c] & n)
739 {
740 if (s >= e || (type[c] & REX_NEST_terminator))
741 break;
742 s++;
743 }
744 }
745 }
746 else
747 {
748 cc = type[co] >> REX_NEST_SHIFT;
749 oc = type[co] & (REX_NEST_open|REX_NEST_close);
750 n = 1;
751 while (s < e)
752 {
753 c = *s++;
754 switch (type[c] & (REX_NEST_escape|REX_NEST_open|REX_NEST_close|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))
755 {
756 case REX_NEST_delimiter:
757 case REX_NEST_terminator:
758 return oc ? 0 : s;
759 case REX_NEST_separator:
760 if (!oc)
761 return s;
762 break;
763 case REX_NEST_escape:
764 if (s >= e)
765 return 0;
766 s++;
767 break;
768 case REX_NEST_open|REX_NEST_close:
769 if (c == cc)
770 {
771 if (!--n)
772 return s;
773 }
774 /*FALLTHROUGH*/
775 case REX_NEST_open:
776 if (c == co)
777 {
778 if (!++n)
779 return 0;
780 }
781 else if (!(s = nestmatch(s, e, type, c)))
782 return 0;
783 break;
784 case REX_NEST_close:
785 if (c != cc)
786 return 0;
787 if (!--n)
788 return s;
789 break;
790 }
791 }
792 return (oc || !(type[UCHAR_MAX+1] & REX_NEST_terminator)) ? 0 : s;
793 }
794 return 0;
795 }
796
797 static int
parse(Env_t * env,Rex_t * rex,Rex_t * cont,unsigned char * s)798 parse(Env_t* env, Rex_t* rex, Rex_t* cont, unsigned char* s)
799 {
800 int c;
801 int d;
802 int i;
803 int m;
804 int n;
805 int r;
806 int* f;
807 unsigned char* p;
808 unsigned char* t;
809 unsigned char* b;
810 unsigned char* e;
811 char* u;
812 regmatch_t* o;
813 Trie_node_t* x;
814 Rex_t* q;
815 Rex_t catcher;
816 Rex_t next;
817
818 for (;;)
819 {
820 DEBUG_TEST(0x0008,(sfprintf(sfstdout, "AHA#%04d 0x%04x parse %s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), env->end - s, s)),(0));
821 switch (rex->type)
822 {
823 case REX_ALT:
824 if (env->stack)
825 {
826 if (matchpush(env, rex))
827 return BAD;
828 if (pospush(env, rex, s, BEG_ALT))
829 return BAD;
830 catcher.type = REX_ALT_CATCH;
831 catcher.serial = rex->serial;
832 catcher.re.alt_catch.cont = cont;
833 catcher.next = rex->next;
834 r = parse(env, rex->re.group.expr.binary.left, &catcher, s);
835 if (r < BEST || (rex->flags & REG_MINIMAL))
836 {
837 matchcopy(env, rex);
838 ((Pos_t*)env->pos->vec + env->pos->cur - 1)->serial = catcher.serial = rex->re.group.expr.binary.serial;
839 n = parse(env, rex->re.group.expr.binary.right, &catcher, s);
840 if (n != NONE)
841 r = n;
842 }
843 pospop(env);
844 matchpop(env, rex);
845 }
846 else
847 {
848 if ((r = parse(env, rex->re.group.expr.binary.left, cont, s)) == NONE)
849 r = parse(env, rex->re.group.expr.binary.right, cont, s);
850 if (r == GOOD)
851 r = BEST;
852 }
853 return r;
854 case REX_ALT_CATCH:
855 if (pospush(env, rex, s, END_ANY))
856 return BAD;
857 r = follow(env, rex, rex->re.alt_catch.cont, s);
858 pospop(env);
859 return r;
860 case REX_BACK:
861 o = &env->match[rex->lo];
862 if (o->rm_so < 0)
863 return NONE;
864 i = o->rm_eo - o->rm_so;
865 e = s + i;
866 if (e > env->end)
867 return NONE;
868 t = env->beg + o->rm_so;
869 if (!(p = rex->map))
870 {
871 while (s < e)
872 if (*s++ != *t++)
873 return NONE;
874 }
875 else if (!mbwide())
876 {
877 while (s < e)
878 if (p[*s++] != p[*t++])
879 return NONE;
880 }
881 else
882 {
883 while (s < e)
884 {
885 c = mbchar(s);
886 d = mbchar(t);
887 if (towupper(c) != towupper(d))
888 return NONE;
889 }
890 }
891 break;
892 case REX_BEG:
893 if ((!(rex->flags & REG_NEWLINE) || s <= env->beg || *(s - 1) != '\n') && ((env->flags & REG_NOTBOL) || s != env->beg))
894 return NONE;
895 break;
896 case REX_CLASS:
897 if (LEADING(env, rex, s))
898 return NONE;
899 n = rex->hi;
900 if (n > env->end - s)
901 n = env->end - s;
902 m = rex->lo;
903 if (m > n)
904 return NONE;
905 r = NONE;
906 if (!(rex->flags & REG_MINIMAL))
907 {
908 for (i = 0; i < n; i++)
909 if (!settst(rex->re.charclass, s[i]))
910 {
911 n = i;
912 break;
913 }
914 for (s += n; n-- >= m; s--)
915 switch (follow(env, rex, cont, s))
916 {
917 case BAD:
918 return BAD;
919 case CUT:
920 return CUT;
921 case BEST:
922 return BEST;
923 case GOOD:
924 r = GOOD;
925 break;
926 }
927 }
928 else
929 {
930 for (e = s + m; s < e; s++)
931 if (!settst(rex->re.charclass, *s))
932 return r;
933 e += n - m;
934 for (;;)
935 {
936 switch (follow(env, rex, cont, s))
937 {
938 case BAD:
939 return BAD;
940 case CUT:
941 return CUT;
942 case BEST:
943 case GOOD:
944 return BEST;
945 }
946 if (s >= e || !settst(rex->re.charclass, *s))
947 break;
948 s++;
949 }
950 }
951 return r;
952 case REX_COLL_CLASS:
953 if (LEADING(env, rex, s))
954 return NONE;
955 n = rex->hi;
956 if (n > env->end - s)
957 n = env->end - s;
958 m = rex->lo;
959 if (m > n)
960 return NONE;
961 r = NONE;
962 e = env->end;
963 if (!(rex->flags & REG_MINIMAL))
964 {
965 if (!(b = (unsigned char*)stkpush(stkstd, n)))
966 {
967 env->error = REG_ESPACE;
968 return BAD;
969 }
970 for (i = 0; s < e && i < n && collmatch(rex, s, e, &t); i++)
971 {
972 b[i] = t - s;
973 s = t;
974 }
975 for (; i-- >= rex->lo; s -= b[i])
976 switch (follow(env, rex, cont, s))
977 {
978 case BAD:
979 stkpop(stkstd);
980 return BAD;
981 case CUT:
982 stkpop(stkstd);
983 return CUT;
984 case BEST:
985 stkpop(stkstd);
986 return BEST;
987 case GOOD:
988 r = GOOD;
989 break;
990 }
991 stkpop(stkstd);
992 }
993 else
994 {
995 for (i = 0; i < m && s < e; i++, s = t)
996 if (!collmatch(rex, s, e, &t))
997 return r;
998 while (i++ <= n)
999 {
1000 switch (follow(env, rex, cont, s))
1001 {
1002 case BAD:
1003 return BAD;
1004 case CUT:
1005 return CUT;
1006 case BEST:
1007 case GOOD:
1008 return BEST;
1009 }
1010 if (s >= e || !collmatch(rex, s, e, &s))
1011 break;
1012 }
1013 }
1014 return r;
1015 case REX_CONJ:
1016 next.type = REX_CONJ_RIGHT;
1017 next.re.conj_right.cont = cont;
1018 next.next = rex->next;
1019 catcher.type = REX_CONJ_LEFT;
1020 catcher.re.conj_left.right = rex->re.group.expr.binary.right;
1021 catcher.re.conj_left.cont = &next;
1022 catcher.re.conj_left.beg = s;
1023 catcher.next = 0;
1024 return parse(env, rex->re.group.expr.binary.left, &catcher, s);
1025 case REX_CONJ_LEFT:
1026 rex->re.conj_left.cont->re.conj_right.end = s;
1027 cont = rex->re.conj_left.cont;
1028 s = rex->re.conj_left.beg;
1029 rex = rex->re.conj_left.right;
1030 continue;
1031 case REX_CONJ_RIGHT:
1032 if (rex->re.conj_right.end != s)
1033 return NONE;
1034 cont = rex->re.conj_right.cont;
1035 break;
1036 case REX_DONE:
1037 if (!env->stack)
1038 return BEST;
1039 n = s - env->beg;
1040 r = env->nsub;
1041 DEBUG_TEST(0x0100,(sfprintf(sfstdout,"AHA#%04d 0x%04x %s (%d,%d)(%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__, debug_flag, rexname(rex), env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->best[3].rm_so, env->best[3].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0));
1042 if ((i = env->best[0].rm_eo) >= 0)
1043 {
1044 if (rex->flags & REG_MINIMAL)
1045 {
1046 if (n > i)
1047 return GOOD;
1048 }
1049 else
1050 {
1051 if (n < i)
1052 return GOOD;
1053 }
1054 if (n == i && better(env,
1055 (Pos_t*)env->bestpos->vec,
1056 (Pos_t*)env->pos->vec,
1057 (Pos_t*)env->bestpos->vec+env->bestpos->cur,
1058 (Pos_t*)env->pos->vec+env->pos->cur,
1059 0) <= 0)
1060 return GOOD;
1061 }
1062 env->best[0].rm_eo = n;
1063 memcpy(&env->best[1], &env->match[1], r * sizeof(regmatch_t));
1064 n = env->pos->cur;
1065 if (!vector(Pos_t, env->bestpos, n))
1066 {
1067 env->error = REG_ESPACE;
1068 return BAD;
1069 }
1070 env->bestpos->cur = n;
1071 memcpy(env->bestpos->vec, env->pos->vec, n * sizeof(Pos_t));
1072 DEBUG_TEST(0x0100,(sfprintf(sfstdout,"AHA#%04d 0x%04x %s (%d,%d)(%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__, debug_flag, rexname(rex), env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->best[3].rm_so, env->best[3].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0));
1073 return GOOD;
1074 case REX_DOT:
1075 if (LEADING(env, rex, s))
1076 return NONE;
1077 n = rex->hi;
1078 if (n > env->end - s)
1079 n = env->end - s;
1080 m = rex->lo;
1081 if (m > n)
1082 return NONE;
1083 if ((c = rex->explicit) >= 0 && !mbwide())
1084 for (i = 0; i < n; i++)
1085 if (s[i] == c)
1086 {
1087 n = i;
1088 break;
1089 }
1090 r = NONE;
1091 if (!(rex->flags & REG_MINIMAL))
1092 {
1093 if (!mbwide())
1094 {
1095 for (s += n; n-- >= m; s--)
1096 switch (follow(env, rex, cont, s))
1097 {
1098 case BAD:
1099 return BAD;
1100 case CUT:
1101 return CUT;
1102 case BEST:
1103 return BEST;
1104 case GOOD:
1105 r = GOOD;
1106 break;
1107 }
1108 }
1109 else
1110 {
1111 if (!(b = (unsigned char*)stkpush(stkstd, n)))
1112 {
1113 env->error = REG_ESPACE;
1114 return BAD;
1115 }
1116 e = env->end;
1117 for (i = 0; s < e && i < n && *s != c; i++)
1118 s += b[i] = MBSIZE(s);
1119 for (; i-- >= m; s -= b[i])
1120 switch (follow(env, rex, cont, s))
1121 {
1122 case BAD:
1123 stkpop(stkstd);
1124 return BAD;
1125 case CUT:
1126 stkpop(stkstd);
1127 return CUT;
1128 case BEST:
1129 stkpop(stkstd);
1130 return BEST;
1131 case GOOD:
1132 r = GOOD;
1133 break;
1134 }
1135 stkpop(stkstd);
1136 }
1137 }
1138 else
1139 {
1140 if (!mbwide())
1141 {
1142 e = s + n;
1143 for (s += m; s <= e; s++)
1144 switch (follow(env, rex, cont, s))
1145 {
1146 case BAD:
1147 return BAD;
1148 case CUT:
1149 return CUT;
1150 case BEST:
1151 case GOOD:
1152 return BEST;
1153 }
1154 }
1155 else
1156 {
1157 e = env->end;
1158 for (i = 0; s < e && i < m && *s != c; i++)
1159 s += MBSIZE(s);
1160 if (i >= m)
1161 for (; s <= e && i <= n; s += MBSIZE(s), i++)
1162 switch (follow(env, rex, cont, s))
1163 {
1164 case BAD:
1165 return BAD;
1166 case CUT:
1167 return CUT;
1168 case BEST:
1169 case GOOD:
1170 return BEST;
1171 }
1172 }
1173 }
1174 return r;
1175 case REX_END:
1176 if ((!(rex->flags & REG_NEWLINE) || *s != '\n') && ((env->flags & REG_NOTEOL) || s < env->end))
1177 return NONE;
1178 break;
1179 case REX_GROUP:
1180 DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), env->end - s, s)),(0));
1181 if (env->stack)
1182 {
1183 if (rex->re.group.number)
1184 env->match[rex->re.group.number].rm_so = s - env->beg;
1185 if (pospush(env, rex, s, BEG_SUB))
1186 return BAD;
1187 catcher.re.group_catch.eo = rex->re.group.number ? &env->match[rex->re.group.number].rm_eo : (regoff_t*)0;
1188 }
1189 catcher.type = REX_GROUP_CATCH;
1190 catcher.serial = rex->serial;
1191 catcher.re.group_catch.cont = cont;
1192 catcher.next = rex->next;
1193 r = parse(env, rex->re.group.expr.rex, &catcher, s);
1194 if (env->stack)
1195 {
1196 pospop(env);
1197 if (rex->re.group.number)
1198 env->match[rex->re.group.number].rm_so = -1;
1199 }
1200 return r;
1201 case REX_GROUP_CATCH:
1202 DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s=>%s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rexname(rex->re.group_catch.cont), env->end - s, s)),(0));
1203 if (env->stack)
1204 {
1205 if (rex->re.group_catch.eo)
1206 *rex->re.group_catch.eo = s - env->beg;
1207 if (pospush(env, rex, s, END_ANY))
1208 return BAD;
1209 }
1210 r = follow(env, rex, rex->re.group_catch.cont, s);
1211 if (env->stack)
1212 {
1213 pospop(env);
1214 if (rex->re.group_catch.eo)
1215 *rex->re.group_catch.eo = -1;
1216 }
1217 return r;
1218 case REX_GROUP_AHEAD:
1219 catcher.type = REX_GROUP_AHEAD_CATCH;
1220 catcher.flags = rex->flags;
1221 catcher.serial = rex->serial;
1222 catcher.re.rep_catch.beg = s;
1223 catcher.re.rep_catch.cont = cont;
1224 catcher.next = rex->next;
1225 return parse(env, rex->re.group.expr.rex, &catcher, s);
1226 case REX_GROUP_AHEAD_CATCH:
1227 return follow(env, rex, rex->re.rep_catch.cont, rex->re.rep_catch.beg);
1228 case REX_GROUP_AHEAD_NOT:
1229 r = parse(env, rex->re.group.expr.rex, NiL, s);
1230 if (r == NONE)
1231 r = follow(env, rex, cont, s);
1232 else if (r != BAD)
1233 r = NONE;
1234 return r;
1235 case REX_GROUP_BEHIND:
1236 if ((s - env->beg) < rex->re.group.size)
1237 return NONE;
1238 catcher.type = REX_GROUP_BEHIND_CATCH;
1239 catcher.flags = rex->flags;
1240 catcher.serial = rex->serial;
1241 catcher.re.behind_catch.beg = s;
1242 catcher.re.behind_catch.end = e = env->end;
1243 catcher.re.behind_catch.cont = cont;
1244 catcher.next = rex->next;
1245 for (t = s - rex->re.group.size; t >= env->beg; t--)
1246 {
1247 env->end = s;
1248 r = parse(env, rex->re.group.expr.rex, &catcher, t);
1249 env->end = e;
1250 if (r != NONE)
1251 return r;
1252 }
1253 return NONE;
1254 case REX_GROUP_BEHIND_CATCH:
1255 if (s != rex->re.behind_catch.beg)
1256 return NONE;
1257 env->end = rex->re.behind_catch.end;
1258 return follow(env, rex, rex->re.behind_catch.cont, rex->re.behind_catch.beg);
1259 case REX_GROUP_BEHIND_NOT:
1260 if ((s - env->beg) < rex->re.group.size)
1261 r = NONE;
1262 else
1263 {
1264 catcher.type = REX_GROUP_BEHIND_NOT_CATCH;
1265 catcher.re.neg_catch.beg = s;
1266 catcher.next = 0;
1267 e = env->end;
1268 env->end = s;
1269 for (t = s - rex->re.group.size; t >= env->beg; t--)
1270 {
1271 r = parse(env, rex->re.group.expr.rex, &catcher, t);
1272 if (r != NONE)
1273 break;
1274 }
1275 env->end = e;
1276 }
1277 if (r == NONE)
1278 r = follow(env, rex, cont, s);
1279 else if (r != BAD)
1280 r = NONE;
1281 return r;
1282 case REX_GROUP_BEHIND_NOT_CATCH:
1283 return s == rex->re.neg_catch.beg ? GOOD : NONE;
1284 case REX_GROUP_COND:
1285 if (q = rex->re.group.expr.binary.right)
1286 {
1287 catcher.re.cond_catch.next[0] = q->re.group.expr.binary.right;
1288 catcher.re.cond_catch.next[1] = q->re.group.expr.binary.left;
1289 }
1290 else
1291 catcher.re.cond_catch.next[0] = catcher.re.cond_catch.next[1] = 0;
1292 if (q = rex->re.group.expr.binary.left)
1293 {
1294 catcher.type = REX_GROUP_COND_CATCH;
1295 catcher.flags = rex->flags;
1296 catcher.serial = rex->serial;
1297 catcher.re.cond_catch.yes = 0;
1298 catcher.re.cond_catch.beg = s;
1299 catcher.re.cond_catch.cont = cont;
1300 catcher.next = rex->next;
1301 r = parse(env, q, &catcher, s);
1302 if (r == BAD || catcher.re.cond_catch.yes)
1303 return r;
1304 }
1305 else if (!rex->re.group.size || rex->re.group.size > 0 && env->match[rex->re.group.size].rm_so >= 0)
1306 r = GOOD;
1307 else
1308 r = NONE;
1309 if (q = catcher.re.cond_catch.next[r != NONE])
1310 {
1311 catcher.type = REX_CAT;
1312 catcher.flags = q->flags;
1313 catcher.serial = q->serial;
1314 catcher.re.group_catch.cont = cont;
1315 catcher.next = rex->next;
1316 return parse(env, q, &catcher, s);
1317 }
1318 return follow(env, rex, cont, s);
1319 case REX_GROUP_COND_CATCH:
1320 rex->re.cond_catch.yes = 1;
1321 catcher.type = REX_CAT;
1322 catcher.flags = rex->flags;
1323 catcher.serial = rex->serial;
1324 catcher.re.group_catch.cont = rex->re.cond_catch.cont;
1325 catcher.next = rex->next;
1326 return parse(env, rex->re.cond_catch.next[1], &catcher, rex->re.cond_catch.beg);
1327 case REX_CAT:
1328 return follow(env, rex, rex->re.group_catch.cont, s);
1329 case REX_GROUP_CUT:
1330 catcher.type = REX_GROUP_CUT_CATCH;
1331 catcher.flags = rex->flags;
1332 catcher.serial = rex->serial;
1333 catcher.re.group_catch.cont = cont;
1334 catcher.next = rex->next;
1335 return parse(env, rex->re.group.expr.rex, &catcher, s);
1336 case REX_GROUP_CUT_CATCH:
1337 switch (r = follow(env, rex, rex->re.group_catch.cont, s))
1338 {
1339 case GOOD:
1340 r = BEST;
1341 break;
1342 case NONE:
1343 r = CUT;
1344 break;
1345 }
1346 return r;
1347 case REX_KMP:
1348 f = rex->re.string.fail;
1349 b = rex->re.string.base;
1350 n = rex->re.string.size;
1351 t = s;
1352 e = env->end;
1353 if (p = rex->map)
1354 {
1355 while (t + n <= e)
1356 {
1357 for (i = -1; t < e; t++)
1358 {
1359 while (i >= 0 && b[i+1] != p[*t])
1360 i = f[i];
1361 if (b[i+1] == p[*t])
1362 i++;
1363 if (i + 1 == n)
1364 {
1365 t++;
1366 if (env->stack)
1367 env->best[0].rm_so = t - s - n;
1368 switch (follow(env, rex, cont, t))
1369 {
1370 case BAD:
1371 return BAD;
1372 case CUT:
1373 return CUT;
1374 case BEST:
1375 case GOOD:
1376 return BEST;
1377 }
1378 t -= n - 1;
1379 break;
1380 }
1381 }
1382 }
1383 }
1384 else
1385 {
1386 while (t + n <= e)
1387 {
1388 for (i = -1; t < e; t++)
1389 {
1390 while (i >= 0 && b[i+1] != *t)
1391 i = f[i];
1392 if (b[i+1] == *t)
1393 i++;
1394 if (i + 1 == n)
1395 {
1396 t++;
1397 if (env->stack)
1398 env->best[0].rm_so = t - s - n;
1399 switch (follow(env, rex, cont, t))
1400 {
1401 case BAD:
1402 return BAD;
1403 case CUT:
1404 return CUT;
1405 case BEST:
1406 case GOOD:
1407 return BEST;
1408 }
1409 t -= n - 1;
1410 break;
1411 }
1412 }
1413 }
1414 }
1415 return NONE;
1416 case REX_NEG:
1417 if (LEADING(env, rex, s))
1418 return NONE;
1419 i = env->end - s;
1420 n = ((i + 7) >> 3) + 1;
1421 catcher.type = REX_NEG_CATCH;
1422 catcher.re.neg_catch.beg = s;
1423 if (!(p = (unsigned char*)stkpush(stkstd, n)))
1424 return BAD;
1425 memset(catcher.re.neg_catch.index = p, 0, n);
1426 catcher.next = rex->next;
1427 if (parse(env, rex->re.group.expr.rex, &catcher, s) == BAD)
1428 r = BAD;
1429 else
1430 {
1431 r = NONE;
1432 for (; i >= 0; i--)
1433 if (!bittst(p, i))
1434 {
1435 switch (follow(env, rex, cont, s + i))
1436 {
1437 case BAD:
1438 r = BAD;
1439 break;
1440 case BEST:
1441 r = BEST;
1442 break;
1443 case CUT:
1444 r = CUT;
1445 break;
1446 case GOOD:
1447 r = GOOD;
1448 /*FALLTHROUGH*/
1449 default:
1450 continue;
1451 }
1452 break;
1453 }
1454 }
1455 stkpop(stkstd);
1456 return r;
1457 case REX_NEG_CATCH:
1458 bitset(rex->re.neg_catch.index, s - rex->re.neg_catch.beg);
1459 return NONE;
1460 case REX_NEST:
1461 if (s >= env->end)
1462 return NONE;
1463 do
1464 {
1465 if ((c = *s++) == rex->re.nest.primary)
1466 {
1467 if (s >= env->end || !(s = nestmatch(s, env->end, rex->re.nest.type, c)))
1468 return NONE;
1469 break;
1470 }
1471 if (rex->re.nest.primary >= 0)
1472 return NONE;
1473 if (rex->re.nest.type[c] & (REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))
1474 break;
1475 if (!(s = nestmatch(s, env->end, rex->re.nest.type, c)))
1476 return NONE;
1477 } while (s < env->end && !(rex->re.nest.type[*(s-1)] & (REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)));
1478 break;
1479 case REX_NULL:
1480 break;
1481 case REX_ONECHAR:
1482 n = rex->hi;
1483 if (n > env->end - s)
1484 n = env->end - s;
1485 m = rex->lo;
1486 if (m > n)
1487 return NONE;
1488 r = NONE;
1489 c = rex->re.onechar;
1490 if (!(rex->flags & REG_MINIMAL))
1491 {
1492 if (!mbwide())
1493 {
1494 if (p = rex->map)
1495 {
1496 for (i = 0; i < n; i++, s++)
1497 if (p[*s] != c)
1498 break;
1499 }
1500 else
1501 {
1502 for (i = 0; i < n; i++, s++)
1503 if (*s != c)
1504 break;
1505 }
1506 for (; i-- >= m; s--)
1507 switch (follow(env, rex, cont, s))
1508 {
1509 case BAD:
1510 return BAD;
1511 case BEST:
1512 return BEST;
1513 case CUT:
1514 return CUT;
1515 case GOOD:
1516 r = GOOD;
1517 break;
1518 }
1519 }
1520 else
1521 {
1522 if (!(b = (unsigned char*)stkpush(stkstd, n)))
1523 {
1524 env->error = REG_ESPACE;
1525 return BAD;
1526 }
1527 e = env->end;
1528 if (!(rex->flags & REG_ICASE))
1529 {
1530 for (i = 0; s < e && i < n; i++, s = t)
1531 {
1532 t = s;
1533 if (mbchar(t) != c)
1534 break;
1535 b[i] = t - s;
1536 }
1537 }
1538 else
1539 {
1540 for (i = 0; s < e && i < n; i++, s = t)
1541 {
1542 t = s;
1543 if (towupper(mbchar(t)) != c)
1544 break;
1545 b[i] = t - s;
1546 }
1547 }
1548 for (; i-- >= m; s -= b[i])
1549 switch (follow(env, rex, cont, s))
1550 {
1551 case BAD:
1552 stkpop(stkstd);
1553 return BAD;
1554 case BEST:
1555 stkpop(stkstd);
1556 return BEST;
1557 case CUT:
1558 stkpop(stkstd);
1559 return CUT;
1560 case GOOD:
1561 r = GOOD;
1562 break;
1563 }
1564 stkpop(stkstd);
1565 }
1566 }
1567 else
1568 {
1569 if (!mbwide())
1570 {
1571 e = s + m;
1572 if (p = rex->map)
1573 {
1574 for (; s < e; s++)
1575 if (p[*s] != c)
1576 return r;
1577 e += n - m;
1578 for (;;)
1579 {
1580 switch (follow(env, rex, cont, s))
1581 {
1582 case BAD:
1583 return BAD;
1584 case CUT:
1585 return CUT;
1586 case BEST:
1587 case GOOD:
1588 return BEST;
1589 }
1590 if (s >= e || p[*s++] != c)
1591 break;
1592 }
1593 }
1594 else
1595 {
1596 for (; s < e; s++)
1597 if (*s != c)
1598 return r;
1599 e += n - m;
1600 for (;;)
1601 {
1602 switch (follow(env, rex, cont, s))
1603 {
1604 case BAD:
1605 return BAD;
1606 case CUT:
1607 return CUT;
1608 case BEST:
1609 case GOOD:
1610 return BEST;
1611 }
1612 if (s >= e || *s++ != c)
1613 break;
1614 }
1615 }
1616 }
1617 else
1618 {
1619 if (!(rex->flags & REG_ICASE))
1620 {
1621 for (i = 0; i < m && s < e; i++, s = t)
1622 {
1623 t = s;
1624 if (mbchar(t) != c)
1625 return r;
1626 }
1627 while (i++ <= n)
1628 {
1629 switch (follow(env, rex, cont, s))
1630 {
1631 case BAD:
1632 return BAD;
1633 case CUT:
1634 return CUT;
1635 case BEST:
1636 case GOOD:
1637 return BEST;
1638 }
1639 if (s >= e || mbchar(s) != c)
1640 break;
1641 }
1642 }
1643 else
1644 {
1645 for (i = 0; i < m && s < e; i++, s = t)
1646 {
1647 t = s;
1648 if (towupper(mbchar(t)) != c)
1649 return r;
1650 }
1651 while (i++ <= n)
1652 {
1653 switch (follow(env, rex, cont, s))
1654 {
1655 case BAD:
1656 return BAD;
1657 case CUT:
1658 return CUT;
1659 case BEST:
1660 case GOOD:
1661 return BEST;
1662 }
1663 if (s >= e || towupper(mbchar(s)) != c)
1664 break;
1665 }
1666 }
1667 }
1668 }
1669 return r;
1670 case REX_REP:
1671 if (env->stack && pospush(env, rex, s, BEG_REP))
1672 return BAD;
1673 r = parserep(env, rex, cont, s, 0);
1674 if (env->stack)
1675 pospop(env);
1676 return r;
1677 case REX_REP_CATCH:
1678 DEBUG_TEST(0x0020,(sfprintf(sfstdout, "AHA#%04d 0x%04x %s n %d len %d s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rex->re.rep_catch.n, s - rex->re.rep_catch.beg, env->end - s, s)),(0));
1679 if (env->stack && pospush(env, rex, s, END_ANY))
1680 return BAD;
1681 if (s == rex->re.rep_catch.beg && rex->re.rep_catch.n > rex->re.rep_catch.ref->lo)
1682 {
1683 /*
1684 * optional empty iteration
1685 */
1686
1687 DEBUG_TEST(0x0002,(sfprintf(sfstdout, "AHA#%04d %p re.group.back=%d re.group.expr.rex=%s\n", __LINE__, rex->re.rep_catch.ref->re.group.expr.rex, rex->re.rep_catch.ref->re.group.expr.rex->re.group.back, rexname(rex->re.rep_catch.ref->re.group.expr.rex))),(0));
1688 if (!env->stack || s != rex->re.rep_catch.ref->re.rep_catch.beg && !rex->re.rep_catch.ref->re.group.expr.rex->re.group.back)
1689 r = NONE;
1690 else if (pospush(env, rex, s, END_ANY))
1691 r = BAD;
1692 else
1693 {
1694 r = follow(env, rex, rex->re.rep_catch.cont, s);
1695 pospop(env);
1696 }
1697 }
1698 else
1699 r = parserep(env, rex->re.rep_catch.ref, rex->re.rep_catch.cont, s, rex->re.rep_catch.n);
1700 if (env->stack)
1701 pospop(env);
1702 return r;
1703 case REX_STRING:
1704 DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s \"%-.*s\" `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rex->re.string.size, rex->re.string.base, env->end - s, s)),(0));
1705 if (rex->re.string.size > (env->end - s))
1706 return NONE;
1707 t = rex->re.string.base;
1708 e = t + rex->re.string.size;
1709 if (!(p = rex->map))
1710 {
1711 while (t < e)
1712 if (*s++ != *t++)
1713 return NONE;
1714 }
1715 else if (!mbwide())
1716 {
1717 while (t < e)
1718 if (p[*s++] != *t++)
1719 return NONE;
1720 }
1721 else
1722 {
1723 while (t < e)
1724 {
1725 c = mbchar(s);
1726 d = mbchar(t);
1727 if (towupper(c) != d)
1728 return NONE;
1729 }
1730 }
1731 break;
1732 case REX_TRIE:
1733 if (((s + rex->re.trie.min) > env->end) || !(x = rex->re.trie.root[rex->map ? rex->map[*s] : *s]))
1734 return NONE;
1735 return parsetrie(env, x, rex, cont, s);
1736 case REX_EXEC:
1737 u = 0;
1738 r = (*env->disc->re_execf)(env->regex, rex->re.exec.data, rex->re.exec.text, rex->re.exec.size, (const char*)s, env->end - s, &u, env->disc);
1739 e = (unsigned char*)u;
1740 if (e >= s && e <= env->end)
1741 s = e;
1742 switch (r)
1743 {
1744 case 0:
1745 break;
1746 case REG_NOMATCH:
1747 return NONE;
1748 default:
1749 env->error = r;
1750 return BAD;
1751 }
1752 break;
1753 case REX_WBEG:
1754 if (!isword(*s) || s > env->beg && isword(*(s - 1)))
1755 return NONE;
1756 break;
1757 case REX_WEND:
1758 if (isword(*s) || s > env->beg && !isword(*(s - 1)))
1759 return NONE;
1760 break;
1761 case REX_WORD:
1762 if (s > env->beg && isword(*(s - 1)) == isword(*s))
1763 return NONE;
1764 break;
1765 case REX_WORD_NOT:
1766 if (s == env->beg || isword(*(s - 1)) != isword(*s))
1767 return NONE;
1768 break;
1769 case REX_BEG_STR:
1770 if (s != env->beg)
1771 return NONE;
1772 break;
1773 case REX_END_STR:
1774 for (t = s; t < env->end && *t == '\n'; t++);
1775 if (t < env->end)
1776 return NONE;
1777 break;
1778 case REX_FIN_STR:
1779 if (s < env->end)
1780 return NONE;
1781 break;
1782 }
1783 if (!(rex = rex->next))
1784 {
1785 if (!(rex = cont))
1786 break;
1787 cont = 0;
1788 }
1789 }
1790 return GOOD;
1791 }
1792
1793 #if _AST_REGEX_DEBUG
1794
1795 static void
listnode(Rex_t * e,int level)1796 listnode(Rex_t* e, int level)
1797 {
1798 int i;
1799
1800 if (e)
1801 {
1802 do
1803 {
1804 for (i = 0; i < level; i++)
1805 sfprintf(sfstderr, " ");
1806 sfprintf(sfstderr, "%s\n", rexname(e));
1807 switch (e->type)
1808 {
1809 case REX_ALT:
1810 case REX_CONJ:
1811 listnode(e->re.group.expr.binary.left, level + 1);
1812 listnode(e->re.group.expr.binary.right, level + 1);
1813 break;
1814 case REX_GROUP:
1815 case REX_GROUP_AHEAD:
1816 case REX_GROUP_AHEAD_NOT:
1817 case REX_GROUP_BEHIND:
1818 case REX_GROUP_BEHIND_NOT:
1819 case REX_GROUP_CUT:
1820 case REX_NEG:
1821 case REX_REP:
1822 listnode(e->re.group.expr.rex, level + 1);
1823 break;
1824 }
1825 } while (e = e->next);
1826 }
1827 }
1828
1829 static int
list(Env_t * env,Rex_t * rex)1830 list(Env_t* env, Rex_t* rex)
1831 {
1832 sfprintf(sfstderr, "AHA regex hard=%d stack=%p\n", env->hard, env->stack);
1833 if (rex)
1834 listnode(rex, 1);
1835 return 0;
1836 }
1837
1838 #endif
1839
1840 /*
1841 * returning REG_BADPAT or REG_ESPACE is not explicitly
1842 * countenanced by the standard
1843 */
1844
1845 int
regnexec(const regex_t * p,const char * s,size_t len,size_t nmatch,regmatch_t * match,regflags_t flags)1846 regnexec(const regex_t* p, const char* s, size_t len, size_t nmatch, regmatch_t* match, regflags_t flags)
1847 {
1848 register int n;
1849 register int i;
1850 int j;
1851 int k;
1852 int m;
1853 int advance;
1854 Env_t* env;
1855 Rex_t* e;
1856
1857 DEBUG_INIT();
1858 DEBUG_TEST(0x0001,(sfprintf(sfstdout, "AHA#%04d 0x%04x regnexec %d 0x%08x `%-.*s'\n", __LINE__, debug_flag, nmatch, flags, len, s)),(0));
1859 if (!p || !(env = p->env))
1860 return REG_BADPAT;
1861 if (!s)
1862 return fatal(env->disc, REG_BADPAT, NiL);
1863 if (len < env->min)
1864 {
1865 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, len, env->min)),(0));
1866 return REG_NOMATCH;
1867 }
1868 env->regex = p;
1869 env->beg = (unsigned char*)s;
1870 env->end = env->beg + len;
1871 stknew(stkstd, &env->stk);
1872 env->flags &= ~REG_EXEC;
1873 env->flags |= (flags & REG_EXEC);
1874 advance = 0;
1875 if (env->stack = env->hard || !(env->flags & REG_NOSUB) && nmatch)
1876 {
1877 n = env->nsub;
1878 if (!(env->match = (regmatch_t*)stkpush(stkstd, 2 * (n + 1) * sizeof(regmatch_t))) ||
1879 !env->pos && !(env->pos = vecopen(16, sizeof(Pos_t))) ||
1880 !env->bestpos && !(env->bestpos = vecopen(16, sizeof(Pos_t))))
1881 {
1882 k = REG_ESPACE;
1883 goto done;
1884 }
1885 env->pos->cur = env->bestpos->cur = 0;
1886 env->best = &env->match[n + 1];
1887 env->best[0].rm_so = 0;
1888 env->best[0].rm_eo = -1;
1889 for (i = 0; i <= n; i++)
1890 env->match[i] = state.nomatch;
1891 if (flags & REG_ADVANCE)
1892 advance = 1;
1893 }
1894 DEBUG_TEST(0x1000,(list(env,env->rex)),(0));
1895 k = REG_NOMATCH;
1896 if ((e = env->rex)->type == REX_BM)
1897 {
1898 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REX_BM\n", __LINE__)),(0));
1899 if (len < e->re.bm.right)
1900 {
1901 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, len, e->re.bm.right)),(0));
1902 goto done;
1903 }
1904 else if (!(flags & REG_LEFT))
1905 {
1906 register unsigned char* buf = (unsigned char*)s;
1907 register size_t index = e->re.bm.left + e->re.bm.size;
1908 register size_t mid = len - e->re.bm.right;
1909 register size_t* skip = e->re.bm.skip;
1910 register size_t* fail = e->re.bm.fail;
1911 register Bm_mask_t** mask = e->re.bm.mask;
1912 Bm_mask_t m;
1913 size_t x;
1914
1915 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REX_BM len=%d right=%d left=%d size=%d %d %d\n", __LINE__, len, e->re.bm.right, e->re.bm.left, e->re.bm.size, index, mid)),(0));
1916 for (;;)
1917 {
1918 while ((index += skip[buf[index]]) < mid);
1919 if (index < HIT)
1920 {
1921 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, index, HIT)),(0));
1922 goto done;
1923 }
1924 index -= HIT;
1925 m = mask[n = e->re.bm.size - 1][buf[index]];
1926 do
1927 {
1928 if (!n--)
1929 {
1930 if (e->re.bm.back < 0)
1931 goto possible;
1932 if (advance)
1933 {
1934 i = index - e->re.bm.back;
1935 s += i;
1936 if (env->stack)
1937 env->best[0].rm_so += i;
1938 goto possible;
1939 }
1940 x = index;
1941 if (index < e->re.bm.back)
1942 index = 0;
1943 else
1944 index -= e->re.bm.back;
1945 while (index <= x)
1946 {
1947 if ((i = parse(env, e->next, &env->done, buf + index)) != NONE)
1948 {
1949 if (env->stack)
1950 env->best[0].rm_so = index;
1951 n = env->nsub;
1952 goto hit;
1953 }
1954 index++;
1955 }
1956 index += e->re.bm.size;
1957 break;
1958 }
1959 } while (m &= mask[n][buf[--index]]);
1960 if ((index += fail[n + 1]) >= len)
1961 goto done;
1962 }
1963 }
1964 possible:
1965 n = env->nsub;
1966 e = e->next;
1967 }
1968 j = env->once || (flags & REG_LEFT);
1969 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d parse once=%d\n", __LINE__, j)),(0));
1970 while ((i = parse(env, e, &env->done, (unsigned char*)s)) == NONE || advance && !env->best[0].rm_eo && !(advance = 0))
1971 {
1972 if (j)
1973 goto done;
1974 i = MBSIZE(s);
1975 s += i;
1976 if ((unsigned char*)s > env->end - env->min)
1977 goto done;
1978 if (env->stack)
1979 env->best[0].rm_so += i;
1980 }
1981 if ((flags & REG_LEFT) && env->stack && env->best[0].rm_so)
1982 goto done;
1983 hit:
1984 if (k = env->error)
1985 goto done;
1986 if (i == CUT)
1987 {
1988 k = env->error = REG_NOMATCH;
1989 goto done;
1990 }
1991 if (!(env->flags & REG_NOSUB))
1992 {
1993 k = (env->flags & (REG_SHELL|REG_AUGMENTED)) == (REG_SHELL|REG_AUGMENTED);
1994 for (i = j = m = 0; j < nmatch; i++)
1995 if (!i || !k || (i & 1))
1996 {
1997 if (i > n)
1998 match[j] = state.nomatch;
1999 else
2000 match[m = j] = env->best[i];
2001 j++;
2002 }
2003 if (k)
2004 {
2005 while (m > 0 && match[m].rm_so == -1 && match[m].rm_eo == -1)
2006 m--;
2007 ((regex_t*)p)->re_nsub = m;
2008 }
2009 }
2010 k = 0;
2011 done:
2012 stkold(stkstd, &env->stk);
2013 env->stk.base = 0;
2014 if (k > REG_NOMATCH)
2015 fatal(p->env->disc, k, NiL);
2016 return k;
2017 }
2018
2019 void
regfree(regex_t * p)2020 regfree(regex_t* p)
2021 {
2022 Env_t* env;
2023
2024 if (p && (env = p->env))
2025 {
2026 #if _REG_subcomp
2027 if (env->sub)
2028 {
2029 regsubfree(p);
2030 p->re_sub = 0;
2031 }
2032 #endif
2033 p->env = 0;
2034 if (--env->refs <= 0 && !(env->disc->re_flags & REG_NOFREE))
2035 {
2036 drop(env->disc, env->rex);
2037 if (env->pos)
2038 vecclose(env->pos);
2039 if (env->bestpos)
2040 vecclose(env->bestpos);
2041 if (env->stk.base)
2042 stkold(stkstd, &env->stk);
2043 alloc(env->disc, env, 0);
2044 }
2045 }
2046 }
2047