1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 1985-2012 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Eclipse Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.eclipse.org/org/documents/epl-v10.html *
11 * (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
19 * Phong Vo <kpv@research.att.com> *
20 * *
21 ***********************************************************************/
22 #pragma prototyped
23
24 /*
25 * posix regex executor
26 * single sized-record interface
27 */
28
29 #include "reglib.h"
30
31 #if _AST_REGEX_DEBUG
32
33 #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n))
34 #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0)
35 #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_exec_debug")) debug |= strtoul(t, NiL, 0); } } while (0)
36
37 static unsigned long debug;
38 static unsigned long debug_flag;
39
40 static const char* rexnames[] =
41 {
42 "REX_NULL",
43 "REX_ALT",
44 "REX_ALT_CATCH",
45 "REX_BACK",
46 "REX_BEG",
47 "REX_BEG_STR",
48 "REX_BM",
49 "REX_CAT",
50 "REX_CLASS",
51 "REX_COLL_CLASS",
52 "REX_CONJ",
53 "REX_CONJ_LEFT",
54 "REX_CONJ_RIGHT",
55 "REX_DONE",
56 "REX_DOT",
57 "REX_END",
58 "REX_END_STR",
59 "REX_EXEC",
60 "REX_FIN_STR",
61 "REX_GROUP",
62 "REX_GROUP_CATCH",
63 "REX_GROUP_AHEAD",
64 "REX_GROUP_AHEAD_CATCH",
65 "REX_GROUP_AHEAD_NOT",
66 "REX_GROUP_BEHIND",
67 "REX_GROUP_BEHIND_CATCH",
68 "REX_GROUP_BEHIND_NOT",
69 "REX_GROUP_BEHIND_NOT_CATCH",
70 "REX_GROUP_COND",
71 "REX_GROUP_COND_CATCH",
72 "REX_GROUP_CUT",
73 "REX_GROUP_CUT_CATCH",
74 "REX_KMP",
75 "REX_NEG",
76 "REX_NEG_CATCH",
77 "REX_NEST",
78 "REX_ONECHAR",
79 "REX_REP",
80 "REX_REP_CATCH",
81 "REX_STRING",
82 "REX_TRIE",
83 "REX_WBEG",
84 "REX_WEND",
85 "REX_WORD",
86 "REX_WORD_NOT"
87 };
88
rexname(Rex_t * rex)89 static const char* rexname(Rex_t* rex)
90 {
91 if (!rex)
92 return "NIL";
93 if (rex->type >= elementsof(rexnames))
94 return "ERROR";
95 return rexnames[rex->type];
96 }
97
98 #else
99
100 #define DEBUG_INIT()
101 #define DEBUG_TEST(f,y,n) (n)
102 #define DEBUG_CODE(f,y,n) do {n} while(0)
103
104 #endif
105
106 #define BEG_ALT 1 /* beginning of an alt */
107 #define BEG_ONE 2 /* beginning of one iteration of a rep */
108 #define BEG_REP 3 /* beginning of a repetition */
109 #define BEG_SUB 4 /* beginning of a subexpression */
110 #define END_ANY 5 /* end of any of above */
111
112 /*
113 * returns from parse()
114 */
115
116 #define NONE 0 /* no parse found */
117 #define GOOD 1 /* some parse was found */
118 #define CUT 2 /* no match and no backtrack */
119 #define BEST 3 /* an unbeatable parse was found */
120 #define BAD 4 /* error ocurred */
121
122 /*
123 * REG_SHELL_DOT test
124 */
125
126 #define LEADING(e,r,s) (*(s)==(e)->leading&&((s)==(e)->beg||*((s)-1)==(r)->explicit))
127
128 /*
129 * Pos_t is for comparing parses. An entry is made in the
130 * array at the beginning and at the end of each Group_t,
131 * each iteration in a Group_t, and each Binary_t.
132 */
133
134 typedef struct
135 {
136 unsigned char* p; /* where in string */
137 size_t length; /* length in string */
138 short serial; /* preorder subpattern number */
139 short be; /* which end of pair */
140 } Pos_t;
141
142 /* ===== begin library support ===== */
143
144 #define vector(t,v,i) (((i)<(v)->max)?(t*)((v)->vec+(i)*(v)->siz):(t*)vecseek(&(v),i))
145
146 static Vector_t*
vecopen(int inc,int siz)147 vecopen(int inc, int siz)
148 {
149 Vector_t* v;
150 Stk_t* sp;
151
152 if (inc <= 0)
153 inc = 16;
154 if (!(sp = stkopen(STK_SMALL|STK_NULL)))
155 return 0;
156 if (!(v = (Vector_t*)stkseek(sp, sizeof(Vector_t) + inc * siz)))
157 {
158 stkclose(sp);
159 return 0;
160 }
161 v->stk = sp;
162 v->vec = (char*)v + sizeof(Vector_t);
163 v->max = v->inc = inc;
164 v->siz = siz;
165 v->cur = 0;
166 return v;
167 }
168
169 static void*
vecseek(Vector_t ** p,int index)170 vecseek(Vector_t** p, int index)
171 {
172 Vector_t* v = *p;
173
174 if (index >= v->max)
175 {
176 while ((v->max += v->inc) <= index);
177 if (!(v = (Vector_t*)stkseek(v->stk, sizeof(Vector_t) + v->max * v->siz)))
178 return 0;
179 *p = v;
180 v->vec = (char*)v + sizeof(Vector_t);
181 }
182 return v->vec + index * v->siz;
183 }
184
185 static void
vecclose(Vector_t * v)186 vecclose(Vector_t* v)
187 {
188 if (v)
189 stkclose(v->stk);
190 }
191
192 typedef struct
193 {
194 Stk_pos_t pos;
195 char data[1];
196 } Stk_frame_t;
197
198 #define stknew(s,p) ((p)->offset=stktell(s),(p)->base=stkfreeze(s,0))
199 #define stkold(s,p) stkset(s,(p)->base,(p)->offset)
200
201 #define stkframe(s) (*((Stk_frame_t**)stktop(s)-1))
202 #define stkdata(s,t) ((t*)stkframe(s)->data)
203 #define stkpop(s) stkold(s,&(stkframe(s)->pos))
204
205 static void*
stkpush(Stk_t * sp,size_t size)206 stkpush(Stk_t* sp, size_t size)
207 {
208 Stk_frame_t* f;
209 Stk_pos_t p;
210
211 stknew(sp, &p);
212 size = sizeof(Stk_frame_t) + sizeof(size_t) + size - 1;
213 if (!(f = (Stk_frame_t*)stkalloc(sp, sizeof(Stk_frame_t) + sizeof(Stk_frame_t*) + size - 1)))
214 return 0;
215 f->pos = p;
216 stkframe(sp) = f;
217 return f->data;
218 }
219
220 /* ===== end library support ===== */
221
222 /*
223 * Match_frame_t is for saving and restoring match records
224 * around alternate attempts, so that fossils will not be
225 * left in the match array. These are the only entries in
226 * the match array that are not otherwise guaranteed to
227 * have current data in them when they get used.
228 */
229
230 typedef struct
231 {
232 size_t size;
233 regmatch_t* match;
234 regmatch_t save[1];
235 } Match_frame_t;
236
237 #define matchpush(e,x) ((x)->re.group.number?_matchpush(e,x):0)
238 #define matchcopy(e,x) do if ((x)->re.group.number) { Match_frame_t* fp = (void*)stkframe(stkstd)->data; memcpy(fp->match, fp->save, fp->size); } while (0)
239 #define matchpop(e,x) do if ((x)->re.group.number) { Match_frame_t* fp = (void*)stkframe(stkstd)->data; memcpy(fp->match, fp->save, fp->size); stkpop(stkstd); } while (0)
240
241 #define pospop(e) (--(e)->pos->cur)
242
243 /*
244 * allocate a frame and push a match onto the stack
245 */
246
247 static int
_matchpush(Env_t * env,Rex_t * rex)248 _matchpush(Env_t* env, Rex_t* rex)
249 {
250 Match_frame_t* f;
251 regmatch_t* m;
252 regmatch_t* e;
253 regmatch_t* s;
254 int num;
255
256 if (rex->re.group.number <= 0 || (num = rex->re.group.last - rex->re.group.number + 1) <= 0)
257 num = 0;
258 if (!(f = (Match_frame_t*)stkpush(stkstd, sizeof(Match_frame_t) + (num - 1) * sizeof(regmatch_t))))
259 {
260 env->error = REG_ESPACE;
261 return 1;
262 }
263 f->size = num * sizeof(regmatch_t);
264 f->match = m = env->match + rex->re.group.number;
265 e = m + num;
266 s = f->save;
267 while (m < e)
268 {
269 *s++ = *m;
270 *m++ = state.nomatch;
271 }
272 return 0;
273 }
274
275 /*
276 * allocate a frame and push a pos onto the stack
277 */
278
279 static int
pospush(Env_t * env,Rex_t * rex,unsigned char * p,int be)280 pospush(Env_t* env, Rex_t* rex, unsigned char* p, int be)
281 {
282 Pos_t* pos;
283
284 if (!(pos = vector(Pos_t, env->pos, env->pos->cur)))
285 {
286 env->error = REG_ESPACE;
287 return 1;
288 }
289 pos->serial = rex->serial;
290 pos->p = p;
291 pos->be = be;
292 env->pos->cur++;
293 return 0;
294 }
295
296 /*
297 * two matches are known to have the same length
298 * os is start of old pos array, ns is start of new,
299 * oend and nend are end+1 pointers to ends of arrays.
300 * oe and ne are ends (not end+1) of subarrays.
301 * returns 1 if new is better, -1 if old, else 0.
302 */
303
304 static int
better(Env_t * env,Pos_t * os,Pos_t * ns,Pos_t * oend,Pos_t * nend,int level)305 better(Env_t* env, Pos_t* os, Pos_t* ns, Pos_t* oend, Pos_t* nend, int level)
306 {
307 Pos_t* oe;
308 Pos_t* ne;
309 int k;
310 int n;
311
312 if (env->error)
313 return -1;
314 for (;;)
315 {
316 DEBUG_CODE(0x0080,{sfprintf(sfstdout, " %-*.*sold ", (level + 3) * 4, (level + 3) * 4, "");for (oe = os; oe < oend; oe++)sfprintf(sfstdout, "<%d,%d,%d>", oe->p - env->beg, oe->serial, oe->be);sfprintf(sfstdout, "\n %-*.*snew ", (level + 3) * 4, (level + 3) * 4, "");for (oe = ns; oe < nend; oe++)sfprintf(sfstdout, "<%d,%d,%d>", oe->p - env->beg, oe->serial, oe->be);sfprintf(sfstdout, "\n");},{0;});
317 if (ns >= nend)
318 return DEBUG_TEST(0x8000,(os < oend),(0));
319 if (os >= oend)
320 return DEBUG_TEST(0x8000,(-1),(1));
321 n = os->serial;
322 if (ns->serial > n)
323 return -1;
324 if (n > ns->serial)
325 {
326 env->error = REG_PANIC;
327 return -1;
328 }
329 if (ns->p > os->p)
330 return 1;
331 if (os->p > ns->p)
332 return -1;
333 oe = os;
334 k = 0;
335 for (;;)
336 if ((++oe)->serial == n)
337 {
338 if (oe->be != END_ANY)
339 k++;
340 else if (k-- <= 0)
341 break;
342 }
343 ne = ns;
344 k = 0;
345 for (;;)
346 if ((++ne)->serial == n)
347 {
348 if (ne->be != END_ANY)
349 k++;
350 else if (k-- <= 0)
351 break;
352 }
353 if (ne->p > oe->p)
354 return 1;
355 if (oe->p > ne->p)
356 return -1;
357 if (k = better(env, os + 1, ns + 1, oe, ne, level + 1))
358 return k;
359 os = oe + 1;
360 ns = ne + 1;
361 }
362 }
363
364 #if _AST_REGEX_DEBUG
365
366 static void
showmatch(regmatch_t * p)367 showmatch(regmatch_t* p)
368 {
369 sfputc(sfstdout, '(');
370 if (p->rm_so < 0)
371 sfputc(sfstdout, '?');
372 else
373 sfprintf(sfstdout, "%z", p->rm_so);
374 sfputc(sfstdout, ',');
375 if (p->rm_eo < 0)
376 sfputc(sfstdout, '?');
377 else
378 sfprintf(sfstdout, "%z", p->rm_eo);
379 sfputc(sfstdout, ')');
380 }
381
382 static int
_better(Env_t * env,Pos_t * os,Pos_t * ns,Pos_t * oend,Pos_t * nend,int level)383 _better(Env_t* env, Pos_t* os, Pos_t* ns, Pos_t* oend, Pos_t* nend, int level)
384 {
385 int i;
386
387 DEBUG_CODE(0x0040,{sfprintf(sfstdout, "AHA better old ");for (i = 0; i <= env->nsub; i++)showmatch(&env->best[i]);sfprintf(sfstdout, "\n new ");for (i = 0; i <= env->nsub; i++)showmatch(&env->match[i]);sfprintf(sfstdout, "\n");},{0;});
388 i = better(env, os, ns, oend, nend, 0);
389 DEBUG_TEST(0x0040,(sfprintf(sfstdout, " %s\n", i <= 0 ? "OLD" : "NEW")),(0));
390 return i;
391 }
392
393 #define better _better
394
395 #endif
396
397 #define follow(e,r,c,s) ((r)->next?parse(e,(r)->next,c,s):(c)?parse(e,c,0,s):BEST)
398
399 static int parse(Env_t*, Rex_t*, Rex_t*, unsigned char*);
400
401 static int
parserep(Env_t * env,Rex_t * rex,Rex_t * cont,unsigned char * s,int n)402 parserep(Env_t* env, Rex_t* rex, Rex_t* cont, unsigned char* s, int n)
403 {
404 int i;
405 int r = NONE;
406 Rex_t catcher;
407
408 DEBUG_TEST(0x0010,(sfprintf(sfstdout, "AHA#%04d 0x%04x parserep %s %d %d %d %d `%-.*s'\n", __LINE__, debug_flag, rexname(rex->re.group.expr.rex), rex->re.group.number, rex->lo, n, rex->hi, env->end - s, s)),(0));
409 if ((rex->flags & REG_MINIMAL) && n >= rex->lo && n < rex->hi)
410 {
411 if (env->stack && pospush(env, rex, s, END_ANY))
412 return BAD;
413 i = follow(env, rex, cont, s);
414 if (env->stack)
415 pospop(env);
416 switch (i)
417 {
418 case BAD:
419 return BAD;
420 case CUT:
421 return CUT;
422 case BEST:
423 case GOOD:
424 return BEST;
425 }
426 }
427 if (n < rex->hi)
428 {
429 catcher.type = REX_REP_CATCH;
430 catcher.serial = rex->serial;
431 catcher.re.rep_catch.ref = rex;
432 catcher.re.rep_catch.cont = cont;
433 catcher.re.rep_catch.beg = s;
434 catcher.re.rep_catch.n = n + 1;
435 catcher.next = rex->next;
436 if (n == 0)
437 rex->re.rep_catch.beg = s;
438 if (env->stack)
439 {
440 if (matchpush(env, rex))
441 return BAD;
442 if (pospush(env, rex, s, BEG_ONE))
443 return BAD;
444 DEBUG_TEST(0x0004,(sfprintf(sfstdout,"AHA#%04d 0x%04x PUSH %d (%z,%z)(%z,%z)(%z,%z) (%z,%z)(%z,%z)(%z,%z)\n", __LINE__, debug_flag, rex->re.group.number, env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo, env->match[2].rm_so, env->match[2].rm_eo)),(0));
445 }
446 r = parse(env, rex->re.group.expr.rex, &catcher, s);
447 DEBUG_TEST(0x0010,(sfprintf(sfstdout, "AHA#%04d 0x%04x parserep parse %d %d `%-.*s'\n", __LINE__, debug_flag, rex->re.group.number, r, env->end - s, s)),(0));
448 if (env->stack)
449 {
450 pospop(env);
451 matchpop(env, rex);
452 DEBUG_TEST(0x0004,(sfprintf(sfstdout,"AHA#%04d 0x%04x POP %d %d (%z,%z)(%z,%z)(%z,%z) (%z,%z)(%z,%z)(%z,%z)\n", __LINE__, debug_flag, rex->re.group.number, r, env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo, env->match[2].rm_so, env->match[2].rm_eo)),(0));
453 }
454 switch (r)
455 {
456 case BAD:
457 return BAD;
458 case BEST:
459 return BEST;
460 case CUT:
461 r = NONE;
462 break;
463 case GOOD:
464 if (rex->flags & REG_MINIMAL)
465 return BEST;
466 r = GOOD;
467 break;
468 }
469 }
470 if (n < rex->lo)
471 return r;
472 if (!(rex->flags & REG_MINIMAL) || n >= rex->hi)
473 {
474 if (env->stack && pospush(env, rex, s, END_ANY))
475 return BAD;
476 i = follow(env, rex, cont, s);
477 if (env->stack)
478 pospop(env);
479 switch (i)
480 {
481 case BAD:
482 r = BAD;
483 break;
484 case CUT:
485 r = CUT;
486 break;
487 case BEST:
488 r = BEST;
489 break;
490 case GOOD:
491 r = (rex->flags & REG_MINIMAL) ? BEST : GOOD;
492 break;
493 }
494 }
495 return r;
496 }
497
498 static int
parsetrie(Env_t * env,Trie_node_t * x,Rex_t * rex,Rex_t * cont,unsigned char * s)499 parsetrie(Env_t* env, Trie_node_t* x, Rex_t* rex, Rex_t* cont, unsigned char* s)
500 {
501 unsigned char* p;
502 int r;
503
504 if (p = rex->map)
505 {
506 for (;;)
507 {
508 if (s >= env->end)
509 return NONE;
510 while (x->c != p[*s])
511 if (!(x = x->sib))
512 return NONE;
513 if (x->end)
514 break;
515 x = x->son;
516 s++;
517 }
518 }
519 else
520 {
521 for (;;)
522 {
523 if (s >= env->end)
524 return NONE;
525 while (x->c != *s)
526 if (!(x = x->sib))
527 return NONE;
528 if (x->end)
529 break;
530 x = x->son;
531 s++;
532 }
533 }
534 s++;
535 if (rex->flags & REG_MINIMAL)
536 switch (follow(env, rex, cont, s))
537 {
538 case BAD:
539 return BAD;
540 case CUT:
541 return CUT;
542 case BEST:
543 case GOOD:
544 return BEST;
545 }
546 if (x->son)
547 switch (parsetrie(env, x->son, rex, cont, s))
548 {
549 case BAD:
550 return BAD;
551 case CUT:
552 return CUT;
553 case BEST:
554 return BEST;
555 case GOOD:
556 if (rex->flags & REG_MINIMAL)
557 return BEST;
558 r = GOOD;
559 break;
560 default:
561 r = NONE;
562 break;
563 }
564 else
565 r = NONE;
566 if (!(rex->flags & REG_MINIMAL))
567 switch (follow(env, rex, cont, s))
568 {
569 case BAD:
570 return BAD;
571 case CUT:
572 return CUT;
573 case BEST:
574 return BEST;
575 case GOOD:
576 return GOOD;
577 }
578 return r;
579 }
580
581 static int
collelt(register Celt_t * ce,char * key,int c,int x)582 collelt(register Celt_t* ce, char* key, int c, int x)
583 {
584 Ckey_t elt;
585
586 mbxfrm(elt, key, COLL_KEY_MAX);
587 for (;; ce++)
588 {
589 switch (ce->typ)
590 {
591 case COLL_call:
592 if (!x && (*ce->fun)(c))
593 return 1;
594 continue;
595 case COLL_char:
596 if (!strcmp((char*)ce->beg, (char*)elt))
597 return 1;
598 continue;
599 case COLL_range:
600 if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max)
601 return 1;
602 continue;
603 case COLL_range_lc:
604 if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max && (iswlower(c) || !iswupper(c)))
605 return 1;
606 continue;
607 case COLL_range_uc:
608 if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max && (iswupper(c) || !iswlower(c)))
609 return 1;
610 continue;
611 }
612 break;
613 }
614 return 0;
615 }
616
617 static int
collic(register Celt_t * ce,char * key,register char * nxt,int c,int x)618 collic(register Celt_t* ce, char* key, register char* nxt, int c, int x)
619 {
620 if (!x)
621 {
622 if (collelt(ce, key, c, x))
623 return 1;
624 if (iswlower(c))
625 c = towupper(c);
626 else if (iswupper(c))
627 c = towlower(c);
628 else
629 return 0;
630 x = mbconv(key, c);
631 key[x] = 0;
632 return collelt(ce, key, c, 0);
633 }
634 while (*nxt)
635 {
636 if (collic(ce, key, nxt + 1, c, x))
637 return 1;
638 if (islower(*nxt))
639 *nxt = toupper(*nxt);
640 else if (isupper(*nxt))
641 *nxt = tolower(*nxt);
642 else
643 return 0;
644 nxt++;
645 }
646 return collelt(ce, key, c, x);
647 }
648
649 static int
collmatch(Rex_t * rex,unsigned char * s,unsigned char * e,unsigned char ** p)650 collmatch(Rex_t* rex, unsigned char* s, unsigned char* e, unsigned char** p)
651 {
652 unsigned char* t;
653 wchar_t c;
654 int w;
655 int r;
656 int x;
657 int ic;
658 Ckey_t key;
659 Ckey_t elt;
660
661 ic = (rex->flags & REG_ICASE);
662 if ((w = MBSIZE(s)) > 1)
663 {
664 memcpy((char*)key, (char*)s, w);
665 key[w] = 0;
666 t = s;
667 c = mbchar(t);
668 #if !_lib_wctype
669 c &= 0xff;
670 #endif
671 x = 0;
672 }
673 else
674 {
675 c = s[0];
676 if (ic && isupper(c))
677 c = tolower(c);
678 key[0] = c;
679 key[1] = 0;
680 if (isalpha(c))
681 {
682 x = e - s;
683 if (x > COLL_KEY_MAX)
684 x = COLL_KEY_MAX;
685 while (w < x)
686 {
687 c = s[w];
688 if (!isalpha(c))
689 break;
690 r = mbxfrm(elt, key, COLL_KEY_MAX);
691 if (ic && isupper(c))
692 c = tolower(c);
693 key[w] = c;
694 key[w + 1] = 0;
695 if (mbxfrm(elt, key, COLL_KEY_MAX) != r)
696 break;
697 w++;
698 }
699 }
700 key[w] = 0;
701 c = key[0];
702 x = w - 1;
703 }
704 r = 1;
705 for (;;)
706 {
707 if (ic ? collic(rex->re.collate.elements, (char*)key, (char*)key, c, x) : collelt(rex->re.collate.elements, (char*)key, c, x))
708 break;
709 if (!x)
710 {
711 r = 0;
712 break;
713 }
714 w = x--;
715 key[w] = 0;
716 }
717 *p = s + w;
718 return rex->re.collate.invert ? !r : r;
719 }
720
721 static unsigned char*
nestmatch(register unsigned char * s,register unsigned char * e,const unsigned short * type,register int co)722 nestmatch(register unsigned char* s, register unsigned char* e, const unsigned short* type, register int co)
723 {
724 register int c;
725 register int cc;
726 unsigned int n;
727 int oc;
728
729 if (type[co] & (REX_NEST_literal|REX_NEST_quote))
730 {
731 n = (type[co] & REX_NEST_literal) ? REX_NEST_terminator : (REX_NEST_escape|REX_NEST_terminator);
732 while (s < e)
733 {
734 c = *s++;
735 if (c == co)
736 return s;
737 else if (type[c] & n)
738 {
739 if (s >= e || (type[c] & REX_NEST_terminator))
740 break;
741 s++;
742 }
743 }
744 }
745 else
746 {
747 cc = type[co] >> REX_NEST_SHIFT;
748 oc = type[co] & (REX_NEST_open|REX_NEST_close);
749 n = 1;
750 while (s < e)
751 {
752 c = *s++;
753 switch (type[c] & (REX_NEST_escape|REX_NEST_open|REX_NEST_close|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))
754 {
755 case REX_NEST_delimiter:
756 case REX_NEST_terminator:
757 return oc ? 0 : s;
758 case REX_NEST_separator:
759 if (!oc)
760 return s;
761 break;
762 case REX_NEST_escape:
763 if (s >= e)
764 return 0;
765 s++;
766 break;
767 case REX_NEST_open|REX_NEST_close:
768 if (c == cc)
769 {
770 if (!--n)
771 return s;
772 }
773 /*FALLTHROUGH*/
774 case REX_NEST_open:
775 if (c == co)
776 {
777 if (!++n)
778 return 0;
779 }
780 else if (!(s = nestmatch(s, e, type, c)))
781 return 0;
782 break;
783 case REX_NEST_close:
784 if (c != cc)
785 return 0;
786 if (!--n)
787 return s;
788 break;
789 }
790 }
791 return (oc || !(type[UCHAR_MAX+1] & REX_NEST_terminator)) ? 0 : s;
792 }
793 return 0;
794 }
795
796 static int
parse(Env_t * env,Rex_t * rex,Rex_t * cont,unsigned char * s)797 parse(Env_t* env, Rex_t* rex, Rex_t* cont, unsigned char* s)
798 {
799 int c;
800 int d;
801 int m;
802 int r;
803 ssize_t i;
804 ssize_t n;
805 int* f;
806 unsigned char* p;
807 unsigned char* t;
808 unsigned char* b;
809 unsigned char* e;
810 char* u;
811 regmatch_t* o;
812 Trie_node_t* x;
813 Rex_t* q;
814 Rex_t catcher;
815 Rex_t next;
816
817 for (;;)
818 {
819 DEBUG_TEST(0x0008,(sfprintf(sfstdout, "AHA#%04d 0x%04x parse %s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), env->end - s, s)),(0));
820 switch (rex->type)
821 {
822 case REX_ALT:
823 if (env->stack)
824 {
825 if (matchpush(env, rex))
826 return BAD;
827 if (pospush(env, rex, s, BEG_ALT))
828 return BAD;
829 catcher.type = REX_ALT_CATCH;
830 catcher.serial = rex->serial;
831 catcher.re.alt_catch.cont = cont;
832 catcher.next = rex->next;
833 r = parse(env, rex->re.group.expr.binary.left, &catcher, s);
834 if (r < BEST || (rex->flags & REG_MINIMAL))
835 {
836 matchcopy(env, rex);
837 ((Pos_t*)env->pos->vec + env->pos->cur - 1)->serial = catcher.serial = rex->re.group.expr.binary.serial;
838 n = parse(env, rex->re.group.expr.binary.right, &catcher, s);
839 if (n != NONE)
840 r = n;
841 }
842 pospop(env);
843 matchpop(env, rex);
844 }
845 else
846 {
847 if ((r = parse(env, rex->re.group.expr.binary.left, cont, s)) == NONE)
848 r = parse(env, rex->re.group.expr.binary.right, cont, s);
849 if (r == GOOD)
850 r = BEST;
851 }
852 return r;
853 case REX_ALT_CATCH:
854 if (pospush(env, rex, s, END_ANY))
855 return BAD;
856 r = follow(env, rex, rex->re.alt_catch.cont, s);
857 pospop(env);
858 return r;
859 case REX_BACK:
860 o = &env->match[rex->lo];
861 if (o->rm_so < 0)
862 return NONE;
863 i = o->rm_eo - o->rm_so;
864 e = s + i;
865 if (e > env->end)
866 return NONE;
867 t = env->beg + o->rm_so;
868 if (!(p = rex->map))
869 {
870 while (s < e)
871 if (*s++ != *t++)
872 return NONE;
873 }
874 else if (!mbwide())
875 {
876 while (s < e)
877 if (p[*s++] != p[*t++])
878 return NONE;
879 }
880 else
881 {
882 while (s < e)
883 {
884 c = mbchar(s);
885 d = mbchar(t);
886 if (towupper(c) != towupper(d))
887 return NONE;
888 }
889 }
890 break;
891 case REX_BEG:
892 if ((!(rex->flags & REG_NEWLINE) || s <= env->beg || *(s - 1) != '\n') && ((env->flags & REG_NOTBOL) || s != env->beg))
893 return NONE;
894 break;
895 case REX_CLASS:
896 if (LEADING(env, rex, s))
897 return NONE;
898 n = rex->hi;
899 if (n > env->end - s)
900 n = env->end - s;
901 m = rex->lo;
902 if (m > n)
903 return NONE;
904 r = NONE;
905 if (!(rex->flags & REG_MINIMAL))
906 {
907 for (i = 0; i < n; i++)
908 if (!settst(rex->re.charclass, s[i]))
909 {
910 n = i;
911 break;
912 }
913 for (s += n; n-- >= m; s--)
914 switch (follow(env, rex, cont, s))
915 {
916 case BAD:
917 return BAD;
918 case CUT:
919 return CUT;
920 case BEST:
921 return BEST;
922 case GOOD:
923 r = GOOD;
924 break;
925 }
926 }
927 else
928 {
929 for (e = s + m; s < e; s++)
930 if (!settst(rex->re.charclass, *s))
931 return r;
932 e += n - m;
933 for (;;)
934 {
935 switch (follow(env, rex, cont, s))
936 {
937 case BAD:
938 return BAD;
939 case CUT:
940 return CUT;
941 case BEST:
942 case GOOD:
943 return BEST;
944 }
945 if (s >= e || !settst(rex->re.charclass, *s))
946 break;
947 s++;
948 }
949 }
950 return r;
951 case REX_COLL_CLASS:
952 if (LEADING(env, rex, s))
953 return NONE;
954 n = rex->hi;
955 if (n > env->end - s)
956 n = env->end - s;
957 m = rex->lo;
958 if (m > n)
959 return NONE;
960 r = NONE;
961 e = env->end;
962 if (!(rex->flags & REG_MINIMAL))
963 {
964 if (!(b = (unsigned char*)stkpush(stkstd, n)))
965 {
966 env->error = REG_ESPACE;
967 return BAD;
968 }
969 for (i = 0; s < e && i < n && collmatch(rex, s, e, &t); i++)
970 {
971 b[i] = t - s;
972 s = t;
973 }
974 for (; i-- >= rex->lo; s -= b[i])
975 switch (follow(env, rex, cont, s))
976 {
977 case BAD:
978 stkpop(stkstd);
979 return BAD;
980 case CUT:
981 stkpop(stkstd);
982 return CUT;
983 case BEST:
984 stkpop(stkstd);
985 return BEST;
986 case GOOD:
987 r = GOOD;
988 break;
989 }
990 stkpop(stkstd);
991 }
992 else
993 {
994 for (i = 0; i < m && s < e; i++, s = t)
995 if (!collmatch(rex, s, e, &t))
996 return r;
997 while (i++ <= n)
998 {
999 switch (follow(env, rex, cont, s))
1000 {
1001 case BAD:
1002 return BAD;
1003 case CUT:
1004 return CUT;
1005 case BEST:
1006 case GOOD:
1007 return BEST;
1008 }
1009 if (s >= e || !collmatch(rex, s, e, &s))
1010 break;
1011 }
1012 }
1013 return r;
1014 case REX_CONJ:
1015 next.type = REX_CONJ_RIGHT;
1016 next.re.conj_right.cont = cont;
1017 next.next = rex->next;
1018 catcher.type = REX_CONJ_LEFT;
1019 catcher.re.conj_left.right = rex->re.group.expr.binary.right;
1020 catcher.re.conj_left.cont = &next;
1021 catcher.re.conj_left.beg = s;
1022 catcher.next = 0;
1023 return parse(env, rex->re.group.expr.binary.left, &catcher, s);
1024 case REX_CONJ_LEFT:
1025 rex->re.conj_left.cont->re.conj_right.end = s;
1026 cont = rex->re.conj_left.cont;
1027 s = rex->re.conj_left.beg;
1028 rex = rex->re.conj_left.right;
1029 continue;
1030 case REX_CONJ_RIGHT:
1031 if (rex->re.conj_right.end != s)
1032 return NONE;
1033 cont = rex->re.conj_right.cont;
1034 break;
1035 case REX_DONE:
1036 if (!env->stack)
1037 return BEST;
1038 n = s - env->beg;
1039 r = env->nsub;
1040 DEBUG_TEST(0x0100,(sfprintf(sfstdout,"AHA#%04d 0x%04x %s (%z,%z)(%z,%z)(%z,%z)(%z,%z) (%z,%z)(%z,%z)\n", __LINE__, debug_flag, rexname(rex), env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->best[3].rm_so, env->best[3].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0));
1041 if ((i = env->best[0].rm_eo) >= 0)
1042 {
1043 if (rex->flags & REG_MINIMAL)
1044 {
1045 if (n > i)
1046 return GOOD;
1047 }
1048 else
1049 {
1050 if (n < i)
1051 return GOOD;
1052 }
1053 if (n == i && better(env,
1054 (Pos_t*)env->bestpos->vec,
1055 (Pos_t*)env->pos->vec,
1056 (Pos_t*)env->bestpos->vec+env->bestpos->cur,
1057 (Pos_t*)env->pos->vec+env->pos->cur,
1058 0) <= 0)
1059 return GOOD;
1060 }
1061 env->best[0].rm_eo = n;
1062 memcpy(&env->best[1], &env->match[1], r * sizeof(regmatch_t));
1063 n = env->pos->cur;
1064 if (!vector(Pos_t, env->bestpos, n))
1065 {
1066 env->error = REG_ESPACE;
1067 return BAD;
1068 }
1069 env->bestpos->cur = n;
1070 memcpy(env->bestpos->vec, env->pos->vec, n * sizeof(Pos_t));
1071 DEBUG_TEST(0x0100,(sfprintf(sfstdout,"AHA#%04d 0x%04x %s (%z,%z)(%z,%z)(%z,%z)(%z,%z) (%z,%z)(%z,%z)\n", __LINE__, debug_flag, rexname(rex), env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->best[3].rm_so, env->best[3].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0));
1072 return GOOD;
1073 case REX_DOT:
1074 if (LEADING(env, rex, s))
1075 return NONE;
1076 n = rex->hi;
1077 if (n > env->end - s)
1078 n = env->end - s;
1079 m = rex->lo;
1080 if (m > n)
1081 return NONE;
1082 if ((c = rex->explicit) >= 0 && !mbwide())
1083 for (i = 0; i < n; i++)
1084 if (s[i] == c)
1085 {
1086 n = i;
1087 break;
1088 }
1089 r = NONE;
1090 if (!(rex->flags & REG_MINIMAL))
1091 {
1092 if (!mbwide())
1093 {
1094 for (s += n; n-- >= m; s--)
1095 switch (follow(env, rex, cont, s))
1096 {
1097 case BAD:
1098 return BAD;
1099 case CUT:
1100 return CUT;
1101 case BEST:
1102 return BEST;
1103 case GOOD:
1104 r = GOOD;
1105 break;
1106 }
1107 }
1108 else
1109 {
1110 if (!(b = (unsigned char*)stkpush(stkstd, n)))
1111 {
1112 env->error = REG_ESPACE;
1113 return BAD;
1114 }
1115 e = env->end;
1116 for (i = 0; s < e && i < n && *s != c; i++)
1117 s += b[i] = MBSIZE(s);
1118 for (; i-- >= m; s -= b[i])
1119 switch (follow(env, rex, cont, s))
1120 {
1121 case BAD:
1122 stkpop(stkstd);
1123 return BAD;
1124 case CUT:
1125 stkpop(stkstd);
1126 return CUT;
1127 case BEST:
1128 stkpop(stkstd);
1129 return BEST;
1130 case GOOD:
1131 r = GOOD;
1132 break;
1133 }
1134 stkpop(stkstd);
1135 }
1136 }
1137 else
1138 {
1139 if (!mbwide())
1140 {
1141 e = s + n;
1142 for (s += m; s <= e; s++)
1143 switch (follow(env, rex, cont, s))
1144 {
1145 case BAD:
1146 return BAD;
1147 case CUT:
1148 return CUT;
1149 case BEST:
1150 case GOOD:
1151 return BEST;
1152 }
1153 }
1154 else
1155 {
1156 e = env->end;
1157 for (i = 0; s < e && i < m && *s != c; i++)
1158 s += MBSIZE(s);
1159 if (i >= m)
1160 for (; s <= e && i <= n; s += MBSIZE(s), i++)
1161 switch (follow(env, rex, cont, s))
1162 {
1163 case BAD:
1164 return BAD;
1165 case CUT:
1166 return CUT;
1167 case BEST:
1168 case GOOD:
1169 return BEST;
1170 }
1171 }
1172 }
1173 return r;
1174 case REX_END:
1175 if ((!(rex->flags & REG_NEWLINE) || *s != '\n') && ((env->flags & REG_NOTEOL) || s < env->end))
1176 return NONE;
1177 break;
1178 case REX_GROUP:
1179 DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), env->end - s, s)),(0));
1180 if (env->stack)
1181 {
1182 if (rex->re.group.number)
1183 env->match[rex->re.group.number].rm_so = s - env->beg;
1184 if (pospush(env, rex, s, BEG_SUB))
1185 return BAD;
1186 catcher.re.group_catch.eo = rex->re.group.number ? &env->match[rex->re.group.number].rm_eo : (regoff_t*)0;
1187 }
1188 catcher.type = REX_GROUP_CATCH;
1189 catcher.serial = rex->serial;
1190 catcher.re.group_catch.cont = cont;
1191 catcher.next = rex->next;
1192 r = parse(env, rex->re.group.expr.rex, &catcher, s);
1193 if (env->stack)
1194 {
1195 pospop(env);
1196 if (rex->re.group.number)
1197 env->match[rex->re.group.number].rm_so = -1;
1198 }
1199 return r;
1200 case REX_GROUP_CATCH:
1201 DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s=>%s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rexname(rex->re.group_catch.cont), env->end - s, s)),(0));
1202 if (env->stack)
1203 {
1204 if (rex->re.group_catch.eo)
1205 *rex->re.group_catch.eo = s - env->beg;
1206 if (pospush(env, rex, s, END_ANY))
1207 return BAD;
1208 }
1209 r = follow(env, rex, rex->re.group_catch.cont, s);
1210 if (env->stack)
1211 {
1212 pospop(env);
1213 if (rex->re.group_catch.eo)
1214 *rex->re.group_catch.eo = -1;
1215 }
1216 return r;
1217 case REX_GROUP_AHEAD:
1218 catcher.type = REX_GROUP_AHEAD_CATCH;
1219 catcher.flags = rex->flags;
1220 catcher.serial = rex->serial;
1221 catcher.re.rep_catch.beg = s;
1222 catcher.re.rep_catch.cont = cont;
1223 catcher.next = rex->next;
1224 return parse(env, rex->re.group.expr.rex, &catcher, s);
1225 case REX_GROUP_AHEAD_CATCH:
1226 return follow(env, rex, rex->re.rep_catch.cont, rex->re.rep_catch.beg);
1227 case REX_GROUP_AHEAD_NOT:
1228 r = parse(env, rex->re.group.expr.rex, NiL, s);
1229 if (r == NONE)
1230 r = follow(env, rex, cont, s);
1231 else if (r != BAD)
1232 r = NONE;
1233 return r;
1234 case REX_GROUP_BEHIND:
1235 if ((s - env->beg) < rex->re.group.size)
1236 return NONE;
1237 catcher.type = REX_GROUP_BEHIND_CATCH;
1238 catcher.flags = rex->flags;
1239 catcher.serial = rex->serial;
1240 catcher.re.behind_catch.beg = s;
1241 catcher.re.behind_catch.end = e = env->end;
1242 catcher.re.behind_catch.cont = cont;
1243 catcher.next = rex->next;
1244 for (t = s - rex->re.group.size; t >= env->beg; t--)
1245 {
1246 env->end = s;
1247 r = parse(env, rex->re.group.expr.rex, &catcher, t);
1248 env->end = e;
1249 if (r != NONE)
1250 return r;
1251 }
1252 return NONE;
1253 case REX_GROUP_BEHIND_CATCH:
1254 if (s != rex->re.behind_catch.beg)
1255 return NONE;
1256 env->end = rex->re.behind_catch.end;
1257 return follow(env, rex, rex->re.behind_catch.cont, rex->re.behind_catch.beg);
1258 case REX_GROUP_BEHIND_NOT:
1259 if ((s - env->beg) < rex->re.group.size)
1260 r = NONE;
1261 else
1262 {
1263 catcher.type = REX_GROUP_BEHIND_NOT_CATCH;
1264 catcher.re.neg_catch.beg = s;
1265 catcher.next = 0;
1266 e = env->end;
1267 env->end = s;
1268 for (t = s - rex->re.group.size; t >= env->beg; t--)
1269 {
1270 r = parse(env, rex->re.group.expr.rex, &catcher, t);
1271 if (r != NONE)
1272 break;
1273 }
1274 env->end = e;
1275 }
1276 if (r == NONE)
1277 r = follow(env, rex, cont, s);
1278 else if (r != BAD)
1279 r = NONE;
1280 return r;
1281 case REX_GROUP_BEHIND_NOT_CATCH:
1282 return s == rex->re.neg_catch.beg ? GOOD : NONE;
1283 case REX_GROUP_COND:
1284 if (q = rex->re.group.expr.binary.right)
1285 {
1286 catcher.re.cond_catch.next[0] = q->re.group.expr.binary.right;
1287 catcher.re.cond_catch.next[1] = q->re.group.expr.binary.left;
1288 }
1289 else
1290 catcher.re.cond_catch.next[0] = catcher.re.cond_catch.next[1] = 0;
1291 if (q = rex->re.group.expr.binary.left)
1292 {
1293 catcher.type = REX_GROUP_COND_CATCH;
1294 catcher.flags = rex->flags;
1295 catcher.serial = rex->serial;
1296 catcher.re.cond_catch.yes = 0;
1297 catcher.re.cond_catch.beg = s;
1298 catcher.re.cond_catch.cont = cont;
1299 catcher.next = rex->next;
1300 r = parse(env, q, &catcher, s);
1301 if (r == BAD || catcher.re.cond_catch.yes)
1302 return r;
1303 }
1304 else if (!rex->re.group.size || rex->re.group.size > 0 && env->match[rex->re.group.size].rm_so >= 0)
1305 r = GOOD;
1306 else
1307 r = NONE;
1308 if (q = catcher.re.cond_catch.next[r != NONE])
1309 {
1310 catcher.type = REX_CAT;
1311 catcher.flags = q->flags;
1312 catcher.serial = q->serial;
1313 catcher.re.group_catch.cont = cont;
1314 catcher.next = rex->next;
1315 return parse(env, q, &catcher, s);
1316 }
1317 return follow(env, rex, cont, s);
1318 case REX_GROUP_COND_CATCH:
1319 rex->re.cond_catch.yes = 1;
1320 catcher.type = REX_CAT;
1321 catcher.flags = rex->flags;
1322 catcher.serial = rex->serial;
1323 catcher.re.group_catch.cont = rex->re.cond_catch.cont;
1324 catcher.next = rex->next;
1325 return parse(env, rex->re.cond_catch.next[1], &catcher, rex->re.cond_catch.beg);
1326 case REX_CAT:
1327 return follow(env, rex, rex->re.group_catch.cont, s);
1328 case REX_GROUP_CUT:
1329 catcher.type = REX_GROUP_CUT_CATCH;
1330 catcher.flags = rex->flags;
1331 catcher.serial = rex->serial;
1332 catcher.re.group_catch.cont = cont;
1333 catcher.next = rex->next;
1334 return parse(env, rex->re.group.expr.rex, &catcher, s);
1335 case REX_GROUP_CUT_CATCH:
1336 switch (r = follow(env, rex, rex->re.group_catch.cont, s))
1337 {
1338 case GOOD:
1339 r = BEST;
1340 break;
1341 case NONE:
1342 r = CUT;
1343 break;
1344 }
1345 return r;
1346 case REX_KMP:
1347 f = rex->re.string.fail;
1348 b = rex->re.string.base;
1349 n = rex->re.string.size;
1350 t = s;
1351 e = env->end;
1352 if (p = rex->map)
1353 {
1354 while (t + n <= e)
1355 {
1356 for (i = -1; t < e; t++)
1357 {
1358 while (i >= 0 && b[i+1] != p[*t])
1359 i = f[i];
1360 if (b[i+1] == p[*t])
1361 i++;
1362 if (i + 1 == n)
1363 {
1364 t++;
1365 if (env->stack)
1366 env->best[0].rm_so = t - s - n;
1367 switch (follow(env, rex, cont, t))
1368 {
1369 case BAD:
1370 return BAD;
1371 case CUT:
1372 return CUT;
1373 case BEST:
1374 case GOOD:
1375 return BEST;
1376 }
1377 t -= n - 1;
1378 break;
1379 }
1380 }
1381 }
1382 }
1383 else
1384 {
1385 while (t + n <= e)
1386 {
1387 for (i = -1; t < e; t++)
1388 {
1389 while (i >= 0 && b[i+1] != *t)
1390 i = f[i];
1391 if (b[i+1] == *t)
1392 i++;
1393 if (i + 1 == n)
1394 {
1395 t++;
1396 if (env->stack)
1397 env->best[0].rm_so = t - s - n;
1398 switch (follow(env, rex, cont, t))
1399 {
1400 case BAD:
1401 return BAD;
1402 case CUT:
1403 return CUT;
1404 case BEST:
1405 case GOOD:
1406 return BEST;
1407 }
1408 t -= n - 1;
1409 break;
1410 }
1411 }
1412 }
1413 }
1414 return NONE;
1415 case REX_NEG:
1416 if (LEADING(env, rex, s))
1417 return NONE;
1418 i = env->end - s;
1419 n = ((i + 7) >> 3) + 1;
1420 catcher.type = REX_NEG_CATCH;
1421 catcher.re.neg_catch.beg = s;
1422 if (!(p = (unsigned char*)stkpush(stkstd, n)))
1423 return BAD;
1424 memset(catcher.re.neg_catch.index = p, 0, n);
1425 catcher.next = rex->next;
1426 if (parse(env, rex->re.group.expr.rex, &catcher, s) == BAD)
1427 r = BAD;
1428 else
1429 {
1430 r = NONE;
1431 for (; i >= 0; i--)
1432 if (!bittst(p, i))
1433 {
1434 switch (follow(env, rex, cont, s + i))
1435 {
1436 case BAD:
1437 r = BAD;
1438 break;
1439 case BEST:
1440 r = BEST;
1441 break;
1442 case CUT:
1443 r = CUT;
1444 break;
1445 case GOOD:
1446 r = GOOD;
1447 /*FALLTHROUGH*/
1448 default:
1449 continue;
1450 }
1451 break;
1452 }
1453 }
1454 stkpop(stkstd);
1455 return r;
1456 case REX_NEG_CATCH:
1457 bitset(rex->re.neg_catch.index, s - rex->re.neg_catch.beg);
1458 return NONE;
1459 case REX_NEST:
1460 if (s >= env->end)
1461 return NONE;
1462 do
1463 {
1464 if ((c = *s++) == rex->re.nest.primary)
1465 {
1466 if (s >= env->end || !(s = nestmatch(s, env->end, rex->re.nest.type, c)))
1467 return NONE;
1468 break;
1469 }
1470 if (rex->re.nest.primary >= 0)
1471 return NONE;
1472 if (rex->re.nest.type[c] & (REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))
1473 break;
1474 if (!(s = nestmatch(s, env->end, rex->re.nest.type, c)))
1475 return NONE;
1476 } while (s < env->end && !(rex->re.nest.type[*(s-1)] & (REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)));
1477 break;
1478 case REX_NULL:
1479 break;
1480 case REX_ONECHAR:
1481 n = rex->hi;
1482 if (n > env->end - s)
1483 n = env->end - s;
1484 m = rex->lo;
1485 if (m > n)
1486 return NONE;
1487 r = NONE;
1488 c = rex->re.onechar;
1489 if (!(rex->flags & REG_MINIMAL))
1490 {
1491 if (!mbwide())
1492 {
1493 if (p = rex->map)
1494 {
1495 for (i = 0; i < n; i++, s++)
1496 if (p[*s] != c)
1497 break;
1498 }
1499 else
1500 {
1501 for (i = 0; i < n; i++, s++)
1502 if (*s != c)
1503 break;
1504 }
1505 for (; i-- >= m; s--)
1506 switch (follow(env, rex, cont, s))
1507 {
1508 case BAD:
1509 return BAD;
1510 case BEST:
1511 return BEST;
1512 case CUT:
1513 return CUT;
1514 case GOOD:
1515 r = GOOD;
1516 break;
1517 }
1518 }
1519 else
1520 {
1521 if (!(b = (unsigned char*)stkpush(stkstd, n)))
1522 {
1523 env->error = REG_ESPACE;
1524 return BAD;
1525 }
1526 e = env->end;
1527 if (!(rex->flags & REG_ICASE))
1528 {
1529 for (i = 0; s < e && i < n; i++, s = t)
1530 {
1531 t = s;
1532 if (mbchar(t) != c)
1533 break;
1534 b[i] = t - s;
1535 }
1536 }
1537 else
1538 {
1539 for (i = 0; s < e && i < n; i++, s = t)
1540 {
1541 t = s;
1542 if (towupper(mbchar(t)) != c)
1543 break;
1544 b[i] = t - s;
1545 }
1546 }
1547 for (; i-- >= m; s -= b[i])
1548 switch (follow(env, rex, cont, s))
1549 {
1550 case BAD:
1551 stkpop(stkstd);
1552 return BAD;
1553 case BEST:
1554 stkpop(stkstd);
1555 return BEST;
1556 case CUT:
1557 stkpop(stkstd);
1558 return CUT;
1559 case GOOD:
1560 r = GOOD;
1561 break;
1562 }
1563 stkpop(stkstd);
1564 }
1565 }
1566 else
1567 {
1568 if (!mbwide())
1569 {
1570 e = s + m;
1571 if (p = rex->map)
1572 {
1573 for (; s < e; s++)
1574 if (p[*s] != c)
1575 return r;
1576 e += n - m;
1577 for (;;)
1578 {
1579 switch (follow(env, rex, cont, s))
1580 {
1581 case BAD:
1582 return BAD;
1583 case CUT:
1584 return CUT;
1585 case BEST:
1586 case GOOD:
1587 return BEST;
1588 }
1589 if (s >= e || p[*s++] != c)
1590 break;
1591 }
1592 }
1593 else
1594 {
1595 for (; s < e; s++)
1596 if (*s != c)
1597 return r;
1598 e += n - m;
1599 for (;;)
1600 {
1601 switch (follow(env, rex, cont, s))
1602 {
1603 case BAD:
1604 return BAD;
1605 case CUT:
1606 return CUT;
1607 case BEST:
1608 case GOOD:
1609 return BEST;
1610 }
1611 if (s >= e || *s++ != c)
1612 break;
1613 }
1614 }
1615 }
1616 else
1617 {
1618 e = env->end;
1619 if (!(rex->flags & REG_ICASE))
1620 {
1621 for (i = 0; i < m && s < e; i++, s = t)
1622 {
1623 t = s;
1624 if (mbchar(t) != c)
1625 return r;
1626 }
1627 while (i++ <= n)
1628 {
1629 switch (follow(env, rex, cont, s))
1630 {
1631 case BAD:
1632 return BAD;
1633 case CUT:
1634 return CUT;
1635 case BEST:
1636 case GOOD:
1637 return BEST;
1638 }
1639 if (s >= e || mbchar(s) != c)
1640 break;
1641 }
1642 }
1643 else
1644 {
1645 for (i = 0; i < m && s < e; i++, s = t)
1646 {
1647 t = s;
1648 if (towupper(mbchar(t)) != c)
1649 return r;
1650 }
1651 while (i++ <= n)
1652 {
1653 switch (follow(env, rex, cont, s))
1654 {
1655 case BAD:
1656 return BAD;
1657 case CUT:
1658 return CUT;
1659 case BEST:
1660 case GOOD:
1661 return BEST;
1662 }
1663 if (s >= e || towupper(mbchar(s)) != c)
1664 break;
1665 }
1666 }
1667 }
1668 }
1669 return r;
1670 case REX_REP:
1671 if (env->stack && pospush(env, rex, s, BEG_REP))
1672 return BAD;
1673 r = parserep(env, rex, cont, s, 0);
1674 if (env->stack)
1675 pospop(env);
1676 return r;
1677 case REX_REP_CATCH:
1678 DEBUG_TEST(0x0020,(sfprintf(sfstdout, "AHA#%04d 0x%04x %s n %d len %d s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rex->re.rep_catch.n, s - rex->re.rep_catch.beg, env->end - s, s)),(0));
1679 if (env->stack && pospush(env, rex, s, END_ANY))
1680 return BAD;
1681 if (s == rex->re.rep_catch.beg && rex->re.rep_catch.n > rex->re.rep_catch.ref->lo)
1682 {
1683 /*
1684 * optional empty iteration
1685 */
1686
1687 DEBUG_TEST(0x0002,(sfprintf(sfstdout, "AHA#%04d %p re.group.back=%d re.group.expr.rex=%s\n", __LINE__, rex->re.rep_catch.ref->re.group.expr.rex, rex->re.rep_catch.ref->re.group.expr.rex->re.group.back, rexname(rex->re.rep_catch.ref->re.group.expr.rex))),(0));
1688 if (!env->stack || s != rex->re.rep_catch.ref->re.rep_catch.beg && !rex->re.rep_catch.ref->re.group.expr.rex->re.group.back)
1689 r = NONE;
1690 else if (pospush(env, rex, s, END_ANY))
1691 r = BAD;
1692 else
1693 {
1694 r = follow(env, rex, rex->re.rep_catch.cont, s);
1695 pospop(env);
1696 }
1697 }
1698 else
1699 r = parserep(env, rex->re.rep_catch.ref, rex->re.rep_catch.cont, s, rex->re.rep_catch.n);
1700 if (env->stack)
1701 pospop(env);
1702 return r;
1703 case REX_STRING:
1704 DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s \"%-.*s\" `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rex->re.string.size, rex->re.string.base, env->end - s, s)),(0));
1705 if (rex->re.string.size > (env->end - s))
1706 return NONE;
1707 t = rex->re.string.base;
1708 e = t + rex->re.string.size;
1709 if (!(p = rex->map))
1710 {
1711 while (t < e)
1712 if (*s++ != *t++)
1713 return NONE;
1714 }
1715 else if (!mbwide())
1716 {
1717 while (t < e)
1718 if (p[*s++] != *t++)
1719 return NONE;
1720 }
1721 else
1722 {
1723 while (t < e)
1724 {
1725 c = mbchar(s);
1726 d = mbchar(t);
1727 if (towupper(c) != d)
1728 return NONE;
1729 }
1730 }
1731 break;
1732 case REX_TRIE:
1733 if (((s + rex->re.trie.min) > env->end) || !(x = rex->re.trie.root[rex->map ? rex->map[*s] : *s]))
1734 return NONE;
1735 return parsetrie(env, x, rex, cont, s);
1736 case REX_EXEC:
1737 u = 0;
1738 r = (*env->disc->re_execf)(env->regex, rex->re.exec.data, rex->re.exec.text, rex->re.exec.size, (const char*)s, env->end - s, &u, env->disc);
1739 e = (unsigned char*)u;
1740 if (e >= s && e <= env->end)
1741 s = e;
1742 switch (r)
1743 {
1744 case 0:
1745 break;
1746 case REG_NOMATCH:
1747 return NONE;
1748 default:
1749 env->error = r;
1750 return BAD;
1751 }
1752 break;
1753 case REX_WBEG:
1754 if (!isword(*s) || s > env->beg && isword(*(s - 1)))
1755 return NONE;
1756 break;
1757 case REX_WEND:
1758 if (isword(*s) || s > env->beg && !isword(*(s - 1)))
1759 return NONE;
1760 break;
1761 case REX_WORD:
1762 if (s > env->beg && isword(*(s - 1)) == isword(*s))
1763 return NONE;
1764 break;
1765 case REX_WORD_NOT:
1766 if (s == env->beg || isword(*(s - 1)) != isword(*s))
1767 return NONE;
1768 break;
1769 case REX_BEG_STR:
1770 if (s != env->beg)
1771 return NONE;
1772 break;
1773 case REX_END_STR:
1774 for (t = s; t < env->end && *t == '\n'; t++);
1775 if (t < env->end)
1776 return NONE;
1777 break;
1778 case REX_FIN_STR:
1779 if (s < env->end)
1780 return NONE;
1781 break;
1782 }
1783 if (!(rex = rex->next))
1784 {
1785 if (!(rex = cont))
1786 break;
1787 cont = 0;
1788 }
1789 }
1790 return GOOD;
1791 }
1792
1793 #if _AST_REGEX_DEBUG
1794
1795 static void
listnode(Rex_t * e,int level)1796 listnode(Rex_t* e, int level)
1797 {
1798 int i;
1799
1800 if (e)
1801 {
1802 do
1803 {
1804 for (i = 0; i < level; i++)
1805 sfprintf(sfstderr, " ");
1806 sfprintf(sfstderr, "%s\n", rexname(e));
1807 switch (e->type)
1808 {
1809 case REX_ALT:
1810 case REX_CONJ:
1811 listnode(e->re.group.expr.binary.left, level + 1);
1812 listnode(e->re.group.expr.binary.right, level + 1);
1813 break;
1814 case REX_GROUP:
1815 case REX_GROUP_AHEAD:
1816 case REX_GROUP_AHEAD_NOT:
1817 case REX_GROUP_BEHIND:
1818 case REX_GROUP_BEHIND_NOT:
1819 case REX_GROUP_CUT:
1820 case REX_NEG:
1821 case REX_REP:
1822 listnode(e->re.group.expr.rex, level + 1);
1823 break;
1824 }
1825 } while (e = e->next);
1826 }
1827 }
1828
1829 static int
list(Env_t * env,Rex_t * rex)1830 list(Env_t* env, Rex_t* rex)
1831 {
1832 sfprintf(sfstderr, "AHA regex hard=%d stack=%p\n", env->hard, env->stack);
1833 if (rex)
1834 listnode(rex, 1);
1835 return 0;
1836 }
1837
1838 #endif
1839
1840 /*
1841 * returning REG_BADPAT or REG_ESPACE is not explicitly
1842 * countenanced by the standard
1843 */
1844
1845 int
regnexec(const regex_t * p,const char * s,size_t len,size_t nmatch,regmatch_t * match,regflags_t flags)1846 regnexec(const regex_t* p, const char* s, size_t len, size_t nmatch, regmatch_t* match, regflags_t flags)
1847 {
1848 register ssize_t n;
1849 register int i;
1850 int j;
1851 int k;
1852 int m;
1853 int advance;
1854 Env_t* env;
1855 Rex_t* e;
1856
1857 DEBUG_INIT();
1858 DEBUG_TEST(0x0001,(sfprintf(sfstdout, "AHA#%04d 0x%04x regnexec %d 0x%08x `%-.*s'\n", __LINE__, debug_flag, nmatch, flags, len, s)),(0));
1859 if (!p || !(env = p->env))
1860 return REG_BADPAT;
1861 if (!s)
1862 return fatal(env->disc, REG_BADPAT, NiL);
1863 if (len < env->min)
1864 {
1865 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, len, env->min)),(0));
1866 return REG_NOMATCH;
1867 }
1868 env->regex = p;
1869 env->beg = (unsigned char*)s;
1870 env->end = env->beg + len;
1871 stknew(stkstd, &env->stk);
1872 env->flags &= ~REG_EXEC;
1873 env->flags |= (flags & REG_EXEC);
1874 advance = 0;
1875 if (env->stack = env->hard || !(env->flags & REG_NOSUB) && nmatch)
1876 {
1877 n = env->nsub;
1878 if (!(env->match = (regmatch_t*)stkpush(stkstd, 2 * (n + 1) * sizeof(regmatch_t))) ||
1879 !env->pos && !(env->pos = vecopen(16, sizeof(Pos_t))) ||
1880 !env->bestpos && !(env->bestpos = vecopen(16, sizeof(Pos_t))))
1881 {
1882 k = REG_ESPACE;
1883 goto done;
1884 }
1885 env->pos->cur = env->bestpos->cur = 0;
1886 env->best = &env->match[n + 1];
1887 env->best[0].rm_so = 0;
1888 env->best[0].rm_eo = -1;
1889 for (i = 0; i <= n; i++)
1890 env->match[i] = state.nomatch;
1891 if (flags & REG_ADVANCE)
1892 advance = 1;
1893 }
1894 DEBUG_TEST(0x1000,(list(env,env->rex)),(0));
1895 k = REG_NOMATCH;
1896 if ((e = env->rex)->type == REX_BM)
1897 {
1898 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REX_BM\n", __LINE__)),(0));
1899 if (len < e->re.bm.right)
1900 {
1901 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, len, e->re.bm.right)),(0));
1902 goto done;
1903 }
1904 else if (!(flags & REG_LEFT))
1905 {
1906 register unsigned char* buf = (unsigned char*)s;
1907 register size_t index = e->re.bm.left + e->re.bm.size;
1908 register size_t mid = len - e->re.bm.right;
1909 register size_t* skip = e->re.bm.skip;
1910 register size_t* fail = e->re.bm.fail;
1911 register Bm_mask_t** mask = e->re.bm.mask;
1912 Bm_mask_t m;
1913 size_t x;
1914
1915 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REX_BM len=%d right=%d left=%d size=%d %d %d\n", __LINE__, len, e->re.bm.right, e->re.bm.left, e->re.bm.size, index, mid)),(0));
1916 for (;;)
1917 {
1918 while ((index += skip[buf[index]]) < mid);
1919 if (index < HIT)
1920 {
1921 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, index, HIT)),(0));
1922 goto done;
1923 }
1924 index -= HIT;
1925 m = mask[n = e->re.bm.size - 1][buf[index]];
1926 do
1927 {
1928 if (!n--)
1929 {
1930 if (e->re.bm.back < 0)
1931 goto possible;
1932 if (advance)
1933 {
1934 i = index - e->re.bm.back;
1935 s += i;
1936 if (env->stack)
1937 env->best[0].rm_so += i;
1938 goto possible;
1939 }
1940 x = index;
1941 if (index < e->re.bm.back)
1942 index = 0;
1943 else
1944 index -= e->re.bm.back;
1945 while (index <= x)
1946 {
1947 if ((i = parse(env, e->next, &env->done, buf + index)) != NONE)
1948 {
1949 if (env->stack)
1950 env->best[0].rm_so = index;
1951 n = env->nsub;
1952 goto hit;
1953 }
1954 index++;
1955 }
1956 index += e->re.bm.size;
1957 break;
1958 }
1959 } while (m &= mask[n][buf[--index]]);
1960 if ((index += fail[n + 1]) >= len)
1961 goto done;
1962 }
1963 }
1964 possible:
1965 n = env->nsub;
1966 e = e->next;
1967 }
1968 j = env->once || (flags & REG_LEFT);
1969 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d parse once=%d\n", __LINE__, j)),(0));
1970 while ((i = parse(env, e, &env->done, (unsigned char*)s)) == NONE || advance && !env->best[0].rm_eo && !(advance = 0))
1971 {
1972 if (j)
1973 goto done;
1974 i = MBSIZE(s);
1975 s += i;
1976 if ((unsigned char*)s > env->end - env->min)
1977 goto done;
1978 if (env->stack)
1979 env->best[0].rm_so += i;
1980 }
1981 if ((flags & REG_LEFT) && env->stack && env->best[0].rm_so)
1982 goto done;
1983 hit:
1984 if (k = env->error)
1985 goto done;
1986 if (i == CUT)
1987 {
1988 k = env->error = REG_NOMATCH;
1989 goto done;
1990 }
1991 if (!(env->flags & REG_NOSUB))
1992 {
1993 k = (env->flags & (REG_SHELL|REG_AUGMENTED)) == (REG_SHELL|REG_AUGMENTED);
1994 for (i = j = m = 0; j < nmatch; i++)
1995 if (!i || !k || (i & 1))
1996 {
1997 if (i > n)
1998 match[j] = state.nomatch;
1999 else
2000 match[m = j] = env->best[i];
2001 j++;
2002 }
2003 if (k)
2004 {
2005 while (m > 0 && match[m].rm_so == -1 && match[m].rm_eo == -1)
2006 m--;
2007 ((regex_t*)p)->re_nsub = m;
2008 }
2009 }
2010 k = 0;
2011 done:
2012 stkold(stkstd, &env->stk);
2013 env->stk.base = 0;
2014 if (k > REG_NOMATCH)
2015 fatal(p->env->disc, k, NiL);
2016 return k;
2017 }
2018
2019 void
regfree(regex_t * p)2020 regfree(regex_t* p)
2021 {
2022 Env_t* env;
2023
2024 if (p && (env = p->env))
2025 {
2026 #if _REG_subcomp
2027 if (env->sub)
2028 {
2029 regsubfree(p);
2030 p->re_sub = 0;
2031 }
2032 #endif
2033 p->env = 0;
2034 if (--env->refs <= 0 && !(env->disc->re_flags & REG_NOFREE))
2035 {
2036 drop(env->disc, env->rex);
2037 if (env->pos)
2038 vecclose(env->pos);
2039 if (env->bestpos)
2040 vecclose(env->bestpos);
2041 if (env->stk.base)
2042 stkold(stkstd, &env->stk);
2043 alloc(env->disc, env, 0);
2044 }
2045 }
2046 }
2047
2048 /*
2049 * 20120528: regoff_t changed from int to ssize_t
2050 */
2051
2052 #if defined(__EXPORT__)
2053 #define extern __EXPORT__
2054 #endif
2055
2056 #undef regnexec
2057 #if _map_libc
2058 #define regnexec _ast_regnexec
2059 #endif
2060
2061 extern int
regnexec(const regex_t * p,const char * s,size_t len,size_t nmatch,oldregmatch_t * oldmatch,regflags_t flags)2062 regnexec(const regex_t* p, const char* s, size_t len, size_t nmatch, oldregmatch_t* oldmatch, regflags_t flags)
2063 {
2064 if (oldmatch)
2065 {
2066 regmatch_t* match;
2067 ssize_t i;
2068 int r;
2069
2070 if (!(match = oldof(0, regmatch_t, nmatch, 0)))
2071 return -1;
2072 if (!(r = regnexec_20120528(p, s, len, nmatch, match, flags)))
2073 for (i = 0; i < nmatch; i++)
2074 {
2075 oldmatch[i].rm_so = match[i].rm_so;
2076 oldmatch[i].rm_eo = match[i].rm_eo;
2077 }
2078 free(match);
2079 return r;
2080 }
2081 return regnexec_20120528(p, s, len, 0, NiL, flags);
2082 }
2083