1 /*
2 * Copyright 2023 Bill Sommerfeld <sommerfeld@hamachi.org>
3 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
4 * Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
5 * Copyright 2012 Milan Jurik. All rights reserved.
6 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
7 * Copyright (c) 1992, 1993, 1994
8 * The Regents of the University of California. All rights reserved.
9 *
10 * This code is derived from software contributed to Berkeley by
11 * Henry Spencer.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 * 3. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 */
37
38 #include "lint.h"
39 #include "file64.h"
40 #include <sys/types.h>
41 #include <stdio.h>
42 #include <string.h>
43 #include <ctype.h>
44 #include <limits.h>
45 #include <regex.h>
46 #include <stdlib.h>
47 #include <stdbool.h>
48 #include <wchar.h>
49 #include <wctype.h>
50
51 #include "../locale/runetype.h"
52 #include "../locale/collate.h"
53
54 #include "utils.h"
55 #include "regex2.h"
56
57 #include "cname.h"
58 #include "../locale/mblocal.h"
59
60 /*
61 * Branching context, used to keep track of branch state for all of the branch-
62 * aware functions. In addition to keeping track of branch positions for the
63 * p_branch_* functions, we use this to simplify some clumsiness in BREs for
64 * detection of whether ^ is acting as an anchor or being used erroneously and
65 * also for whether we're in a sub-expression or not.
66 */
67 struct branchc {
68 sopno start;
69 sopno back;
70 sopno fwd;
71
72 int nbranch;
73 int nchain;
74 bool outer;
75 bool terminate;
76 };
77
78 /*
79 * parse structure, passed up and down to avoid global variables and
80 * other clumsinesses
81 */
82 struct parse {
83 const char *next; /* next character in RE */
84 const char *end; /* end of string (-> NUL normally) */
85 int error; /* has an error been seen? */
86 sop *strip; /* malloced strip */
87 sopno ssize; /* malloced strip size (allocated) */
88 sopno slen; /* malloced strip length (used) */
89 int ncsalloc; /* number of csets allocated */
90 wint_t nc; /* size of small-char bitmap in cset */
91 struct re_guts *g;
92 #define NPAREN 10 /* we need to remember () 1-9 for back refs */
93 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
94 sopno pend[NPAREN]; /* -> ) ([0] unused) */
95 bool allowbranch; /* can this expression branch? */
96 bool bre; /* convenience; is this a BRE? */
97 bool (*parse_expr)(struct parse *, struct branchc *);
98 void (*pre_parse)(struct parse *, struct branchc *);
99 void (*post_parse)(struct parse *, struct branchc *);
100 };
101
102 /* ========= begin header generated by ./mkh ========= */
103 #ifdef __cplusplus
104 extern "C" {
105 #endif
106
107 /* === regcomp.c === */
108 static bool p_ere_exp(struct parse *p, struct branchc *bc);
109 static void p_str(struct parse *p);
110 static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
111 static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
112 static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
113 static bool p_branch_empty(struct parse *p, struct branchc *bc);
114 static bool p_branch_do(struct parse *p, struct branchc *bc);
115 static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
116 static void p_bre_post_parse(struct parse *p, struct branchc *bc);
117 static void p_re(struct parse *p, int end1, int end2);
118 static bool p_simp_re(struct parse *p, struct branchc *bc);
119 static int p_count(struct parse *p);
120 static void p_bracket(struct parse *p);
121 static void p_b_term(struct parse *p, cset *cs);
122 static void p_b_cclass(struct parse *p, cset *cs);
123 static void p_b_eclass(struct parse *p, cset *cs);
124 static wint_t p_b_symbol(struct parse *p);
125 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
126 static wint_t othercase(wint_t ch);
127 static void bothcases(struct parse *p, wint_t ch);
128 static void ordinary(struct parse *p, wint_t ch);
129 static void nonnewline(struct parse *p);
130 static void repeat(struct parse *p, sopno start, int from, int to);
131 static int seterr(struct parse *p, int e);
132 static cset *allocset(struct parse *p);
133 static void freeset(struct parse *p, cset *cs);
134 static void CHadd(struct parse *p, cset *cs, wint_t ch);
135 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
136 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
137 static wint_t singleton(struct parse *p, cset *cs);
138 static sopno dupl(struct parse *p, sopno start, sopno finish);
139 static void doemit(struct parse *p, sop op, size_t opnd);
140 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
141 static void dofwd(struct parse *p, sopno pos, sop value);
142 static int enlarge(struct parse *p, sopno size);
143 static void stripsnug(struct parse *p, struct re_guts *g);
144 static void findmust(struct parse *p, struct re_guts *g);
145 static int altoffset(sop *scan, int offset);
146 static void computejumps(struct parse *p, struct re_guts *g);
147 static void computematchjumps(struct parse *p, struct re_guts *g);
148 static sopno pluscount(struct parse *p, struct re_guts *g);
149 static wint_t wgetnext(struct parse *p);
150
151 #ifdef __cplusplus
152 }
153 #endif
154 /* ========= end header generated by ./mkh ========= */
155
156 static char nuls[10]; /* place to point scanner in event of error */
157
158 /*
159 * macros for use with parse structure
160 * BEWARE: these know that the parse structure is named `p' !!!
161 */
162 #define PEEK() (*p->next)
163 #define PEEK2() (*(p->next+1))
164 #define MORE() (p->next < p->end)
165 #define MORE2() (p->next+1 < p->end)
166 #define SEE(c) (MORE() && PEEK() == (c))
167 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
168 #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a))
169 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
170 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
171 #define NEXT() (p->next++)
172 #define NEXT2() (p->next += 2)
173 #define NEXTn(n) (p->next += (n))
174 #define GETNEXT() (*p->next++)
175 #define WGETNEXT() wgetnext(p)
176 #define SETERROR(e) ((void)seterr(p, (e)))
177 #define REQUIRE(co, e) ((co) || seterr(p, e))
178 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
179 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
180 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
181 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
182 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
183 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
184 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
185 #define HERE() (p->slen)
186 #define THERE() (p->slen - 1)
187 #define THERETHERE() (p->slen - 2)
188 #define DROP(n) (p->slen -= (n))
189
190 #ifndef NDEBUG
191 static int never = 0; /* for use in asserts; shuts lint up */
192 #else
193 #define never 0 /* some <assert.h>s have bugs too */
194 #endif
195
196 /*
197 * regcomp - interface for parser and compilation
198 */
199 int /* 0 success, otherwise REG_something */
regcomp(regex_t * _RESTRICT_KYWD preg,const char * _RESTRICT_KYWD pattern,int cflags)200 regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern,
201 int cflags)
202 {
203 struct parse pa;
204 struct re_guts *g;
205 struct parse *p = &pa;
206 int i;
207 size_t len;
208 size_t maxlen;
209 #ifdef REDEBUG
210 #define GOODFLAGS(f) (f)
211 #else
212 #define GOODFLAGS(f) ((f)&~REG_DUMP)
213 #endif
214
215 /*
216 * Which character values are kept in cset bitmaps?
217 *
218 * Character sets store their members as a bitmap (for low-codepoint
219 * characters) or as elements of an array; pa.nc sets the dividing
220 * point between them.
221 *
222 * The value of MB_CUR_MAX depends on the current locale; fetching the
223 * current locale can be expensive, so we pick a value now and stick
224 * with it for the lifetime of the compiled regex.
225 */
226 pa.nc = ((MB_CUR_MAX) == 1 ? (NC_MAX) : (NC_WIDE));
227
228 /* We had REG_INVARG, but we don't have that on Solaris. */
229 cflags = GOODFLAGS(cflags);
230 if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
231 return (REG_EFATAL);
232
233 if (cflags®_PEND) {
234 if (preg->re_endp < pattern)
235 return (REG_EFATAL);
236 len = preg->re_endp - pattern;
237 } else
238 len = strlen(pattern);
239
240 /* do the mallocs early so failure handling is easy */
241 g = (struct re_guts *)malloc(sizeof (struct re_guts));
242 if (g == NULL)
243 return (REG_ESPACE);
244 g->mb_cur_max = MB_CUR_MAX;
245 /*
246 * Limit the pattern space to avoid a 32-bit overflow on buffer
247 * extension. Also avoid any signed overflow in case of conversion
248 * so make the real limit based on a 31-bit overflow.
249 *
250 * Likely not applicable on 64-bit systems but handle the case
251 * generically (who are we to stop people from using ~715MB+
252 * patterns?).
253 */
254 maxlen = ((size_t)-1 >> 1) / sizeof (sop) * 2 / 3;
255 if (len >= maxlen) {
256 free((char *)g);
257 return (REG_ESPACE);
258 }
259 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
260 assert(p->ssize >= len);
261
262 p->strip = (sop *)malloc(p->ssize * sizeof (sop));
263 p->slen = 0;
264 if (p->strip == NULL) {
265 free((char *)g);
266 return (REG_ESPACE);
267 }
268
269 /* set things up */
270 p->g = g;
271 p->next = pattern; /* convenience; we do not modify it */
272 p->end = p->next + len;
273 p->error = 0;
274 p->ncsalloc = 0;
275 for (i = 0; i < NPAREN; i++) {
276 p->pbegin[i] = 0;
277 p->pend[i] = 0;
278 }
279 if (cflags & REG_EXTENDED) {
280 p->allowbranch = true;
281 p->bre = false;
282 p->parse_expr = p_ere_exp;
283 p->pre_parse = NULL;
284 p->post_parse = NULL;
285 } else {
286 p->allowbranch = false;
287 p->bre = true;
288 p->parse_expr = p_simp_re;
289 p->pre_parse = p_bre_pre_parse;
290 p->post_parse = p_bre_post_parse;
291 }
292 g->sets = NULL;
293 g->ncsets = 0;
294 g->cflags = cflags;
295 g->iflags = 0;
296 g->nbol = 0;
297 g->neol = 0;
298 g->must = NULL;
299 g->moffset = -1;
300 g->charjump = NULL;
301 g->matchjump = NULL;
302 g->mlen = 0;
303 g->nsub = 0;
304 g->backrefs = 0;
305
306 /* do it */
307 EMIT(OEND, 0);
308 g->firststate = THERE();
309 if (cflags & REG_NOSPEC)
310 p_str(p);
311 else
312 p_re(p, OUT, OUT);
313 EMIT(OEND, 0);
314 g->laststate = THERE();
315
316 /* tidy up loose ends and fill things in */
317 stripsnug(p, g);
318 findmust(p, g);
319 /*
320 * only use Boyer-Moore algorithm if the pattern is bigger
321 * than three characters
322 */
323 if (g->mlen > 3) {
324 computejumps(p, g);
325 computematchjumps(p, g);
326 if (g->matchjump == NULL && g->charjump != NULL) {
327 free(g->charjump);
328 g->charjump = NULL;
329 }
330 }
331 g->nplus = pluscount(p, g);
332 g->magic = MAGIC2;
333 preg->re_nsub = g->nsub;
334 preg->re_g = g;
335 preg->re_magic = MAGIC1;
336 #ifndef REDEBUG
337 /* not debugging, so can't rely on the assert() in regexec() */
338 if (g->iflags&BAD)
339 SETERROR(REG_EFATAL);
340 #endif
341
342 /* win or lose, we're done */
343 if (p->error != 0) /* lose */
344 regfree(preg);
345 return (p->error);
346 }
347
348 /*
349 * Parse one subERE, an atom possibly followed by a repetition op,
350 * return whether we should terminate or not.
351 */
352 static bool
p_ere_exp(struct parse * p,struct branchc * bc)353 p_ere_exp(struct parse *p, struct branchc *bc)
354 {
355 char c;
356 wint_t wc;
357 sopno pos;
358 int count;
359 int count2;
360 sopno subno;
361 int wascaret = 0;
362
363 (void) bc;
364 assert(MORE()); /* caller should have ensured this */
365 c = GETNEXT();
366
367 pos = HERE();
368 switch (c) {
369 case '(':
370 (void) REQUIRE(MORE(), REG_EPAREN);
371 p->g->nsub++;
372 subno = p->g->nsub;
373 if (subno < NPAREN)
374 p->pbegin[subno] = HERE();
375 EMIT(OLPAREN, subno);
376 if (!SEE(')'))
377 p_re(p, ')', IGN);
378 if (subno < NPAREN) {
379 p->pend[subno] = HERE();
380 assert(p->pend[subno] != 0);
381 }
382 EMIT(ORPAREN, subno);
383 (void) MUSTEAT(')', REG_EPAREN);
384 break;
385 #ifndef POSIX_MISTAKE
386 case ')': /* happens only if no current unmatched ( */
387 /*
388 * You may ask, why the ifndef? Because I didn't notice
389 * this until slightly too late for 1003.2, and none of the
390 * other 1003.2 regular-expression reviewers noticed it at
391 * all. So an unmatched ) is legal POSIX, at least until
392 * we can get it fixed.
393 */
394 SETERROR(REG_EPAREN);
395 break;
396 #endif
397 case '^':
398 EMIT(OBOL, 0);
399 p->g->iflags |= USEBOL;
400 p->g->nbol++;
401 wascaret = 1;
402 break;
403 case '$':
404 EMIT(OEOL, 0);
405 p->g->iflags |= USEEOL;
406 p->g->neol++;
407 break;
408 case '|':
409 SETERROR(REG_BADPAT);
410 break;
411 case '*':
412 case '+':
413 case '?':
414 case '{':
415 SETERROR(REG_BADRPT);
416 break;
417 case '.':
418 if (p->g->cflags®_NEWLINE)
419 nonnewline(p);
420 else
421 EMIT(OANY, 0);
422 break;
423 case '[':
424 p_bracket(p);
425 break;
426 case '\\':
427 (void) REQUIRE(MORE(), REG_EESCAPE);
428 wc = WGETNEXT();
429 switch (wc) {
430 case '<':
431 EMIT(OBOW, 0);
432 break;
433 case '>':
434 EMIT(OEOW, 0);
435 break;
436 default:
437 ordinary(p, wc);
438 break;
439 }
440 break;
441 default:
442 if (p->error != 0)
443 return (false);
444 p->next--;
445 wc = WGETNEXT();
446 ordinary(p, wc);
447 break;
448 }
449
450 if (!MORE())
451 return (false);
452 c = PEEK();
453 /* we call { a repetition if followed by a digit */
454 if (!(c == '*' || c == '+' || c == '?' || c == '{'))
455 return (false); /* no repetition, we're done */
456 else if (c == '{')
457 (void) REQUIRE(MORE2() && \
458 (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
459 NEXT();
460
461 (void) REQUIRE(!wascaret, REG_BADRPT);
462 switch (c) {
463 case '*': /* implemented as +? */
464 /* this case does not require the (y|) trick, noKLUDGE */
465 INSERT(OPLUS_, pos);
466 ASTERN(O_PLUS, pos);
467 INSERT(OQUEST_, pos);
468 ASTERN(O_QUEST, pos);
469 break;
470 case '+':
471 INSERT(OPLUS_, pos);
472 ASTERN(O_PLUS, pos);
473 break;
474 case '?':
475 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
476 INSERT(OCH_, pos); /* offset slightly wrong */
477 ASTERN(OOR1, pos); /* this one's right */
478 AHEAD(pos); /* fix the OCH_ */
479 EMIT(OOR2, 0); /* offset very wrong... */
480 AHEAD(THERE()); /* ...so fix it */
481 ASTERN(O_CH, THERETHERE());
482 break;
483 case '{':
484 count = p_count(p);
485 if (EAT(',')) {
486 if (isdigit((uch)PEEK())) {
487 count2 = p_count(p);
488 (void) REQUIRE(count <= count2, REG_BADBR);
489 } else /* single number with comma */
490 count2 = INFINITY;
491 } else /* just a single number */
492 count2 = count;
493 repeat(p, pos, count, count2);
494 if (!EAT('}')) { /* error heuristics */
495 while (MORE() && PEEK() != '}')
496 NEXT();
497 (void) REQUIRE(MORE(), REG_EBRACE);
498 SETERROR(REG_BADBR);
499 }
500 break;
501 }
502
503 if (!MORE())
504 return (false);
505 c = PEEK();
506 if (!(c == '*' || c == '+' || c == '?' ||
507 (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
508 return (false);
509 SETERROR(REG_BADRPT);
510 return (false);
511 }
512
513 /*
514 * p_str - string (no metacharacters) "parser"
515 */
516 static void
p_str(struct parse * p)517 p_str(struct parse *p)
518 {
519 (void) REQUIRE(MORE(), REG_BADPAT);
520 while (MORE())
521 ordinary(p, WGETNEXT());
522 }
523
524 /*
525 * Eat consecutive branch delimiters for the kind of expression that we are
526 * parsing, return the number of delimiters that we ate.
527 */
528 static int
p_branch_eat_delim(struct parse * p,struct branchc * bc)529 p_branch_eat_delim(struct parse *p, struct branchc *bc)
530 {
531 int nskip;
532
533 (void) bc;
534 nskip = 0;
535 while (EAT('|'))
536 ++nskip;
537 return (nskip);
538 }
539
540 /*
541 * Insert necessary branch book-keeping operations. This emits a
542 * bogus 'next' offset, since we still have more to parse
543 */
544 static void
p_branch_ins_offset(struct parse * p,struct branchc * bc)545 p_branch_ins_offset(struct parse *p, struct branchc *bc)
546 {
547 if (bc->nbranch == 0) {
548 INSERT(OCH_, bc->start); /* offset is wrong */
549 bc->fwd = bc->start;
550 bc->back = bc->start;
551 }
552
553 ASTERN(OOR1, bc->back);
554 bc->back = THERE();
555 AHEAD(bc->fwd); /* fix previous offset */
556 bc->fwd = HERE();
557 EMIT(OOR2, 0); /* offset is very wrong */
558 ++bc->nbranch;
559 }
560
561 /*
562 * Fix the offset of the tail branch, if we actually had any branches.
563 * This is to correct the bogus placeholder offset that we use.
564 */
565 static void
p_branch_fix_tail(struct parse * p,struct branchc * bc)566 p_branch_fix_tail(struct parse *p, struct branchc *bc)
567 {
568 /* Fix bogus offset at the tail if we actually have branches */
569 if (bc->nbranch > 0) {
570 AHEAD(bc->fwd);
571 ASTERN(O_CH, bc->back);
572 }
573 }
574
575 /*
576 * Signal to the parser that an empty branch has been encountered; this will,
577 * in the future, be used to allow for more permissive behavior with empty
578 * branches. The return value should indicate whether parsing may continue
579 * or not.
580 */
581 static bool
p_branch_empty(struct parse * p,struct branchc * bc)582 p_branch_empty(struct parse *p, struct branchc *bc)
583 {
584 (void) bc;
585 SETERROR(REG_BADPAT);
586 return (false);
587 }
588
589 /*
590 * Take care of any branching requirements. This includes inserting the
591 * appropriate branching instructions as well as eating all of the branch
592 * delimiters until we either run out of pattern or need to parse more pattern.
593 */
594 static bool
p_branch_do(struct parse * p,struct branchc * bc)595 p_branch_do(struct parse *p, struct branchc *bc)
596 {
597 int ate = 0;
598
599 ate = p_branch_eat_delim(p, bc);
600 if (ate == 0)
601 return (false);
602 else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
603 /*
604 * Halt parsing only if we have an empty branch and
605 * p_branch_empty indicates that we must not continue.
606 * In the future, this will not necessarily be an error.
607 */
608 return (false);
609 p_branch_ins_offset(p, bc);
610
611 return (true);
612 }
613
614 static void
p_bre_pre_parse(struct parse * p,struct branchc * bc)615 p_bre_pre_parse(struct parse *p, struct branchc *bc)
616 {
617 (void) bc;
618 /*
619 * Does not move cleanly into expression parser because of
620 * ordinary interpration of * at the beginning position of
621 * an expression.
622 */
623 if (EAT('^')) {
624 EMIT(OBOL, 0);
625 p->g->iflags |= USEBOL;
626 p->g->nbol++;
627 }
628 }
629
630 static void
p_bre_post_parse(struct parse * p,struct branchc * bc)631 p_bre_post_parse(struct parse *p, struct branchc *bc)
632 {
633 /* Expression is terminating due to EOL token */
634 if (bc->terminate) {
635 DROP(1);
636 EMIT(OEOL, 0);
637 p->g->iflags |= USEEOL;
638 p->g->neol++;
639 }
640 }
641
642 /*
643 * Top level parser, concatenation and BRE anchoring.
644 * Giving end1 as OUT essentially eliminates the end1/end2 check.
645 *
646 * This implementation is a bit of a kludge, in that a trailing $ is first
647 * taken as an ordinary character and then revised to be an anchor.
648 * The amount of lookahead needed to avoid this kludge is excessive.
649 */
650 static void
p_re(struct parse * p,int end1,int end2)651 p_re(struct parse *p,
652 int end1, /* first terminating character */
653 int end2) /* second terminating character; ignored for EREs */
654 {
655 struct branchc bc;
656
657 bc.nbranch = 0;
658 if (end1 == OUT && end2 == OUT)
659 bc.outer = true;
660 else
661 bc.outer = false;
662 #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2))
663 for (;;) {
664 bc.start = HERE();
665 bc.nchain = 0;
666 bc.terminate = false;
667 if (p->pre_parse != NULL)
668 p->pre_parse(p, &bc);
669 while (MORE() && (!p->allowbranch || !SEESPEC('|')) &&
670 !SEEEND()) {
671 bc.terminate = p->parse_expr(p, &bc);
672 ++bc.nchain;
673 }
674 if (p->post_parse != NULL)
675 p->post_parse(p, &bc);
676 (void) REQUIRE(HERE() != bc.start, REG_BADPAT);
677 if (!p->allowbranch)
678 break;
679 /*
680 * p_branch_do's return value indicates whether we should
681 * continue parsing or not. This is both for correctness and
682 * a slight optimization, because it will check if we've
683 * encountered an empty branch or the end of the string
684 * immediately following a branch delimiter.
685 */
686 if (!p_branch_do(p, &bc))
687 break;
688 }
689 #undef SEE_END
690 if (p->allowbranch)
691 p_branch_fix_tail(p, &bc);
692 assert(!MORE() || SEE(end1));
693 }
694
695 /*
696 * p_simp_re - parse a simple RE, an atom possibly followed by a repetition
697 */
698 static bool /* was the simple RE an unbackslashed $? */
p_simp_re(struct parse * p,struct branchc * bc)699 p_simp_re(struct parse *p, struct branchc *bc)
700 {
701 int c;
702 int count;
703 int count2;
704 sopno pos;
705 int i;
706 wint_t wc;
707 sopno subno;
708 #define BACKSL (1<<CHAR_BIT)
709
710 pos = HERE(); /* repetition op, if any, covers from here */
711
712 assert(MORE()); /* caller should have ensured this */
713 c = GETNEXT();
714 if (c == '\\') {
715 (void) REQUIRE(MORE(), REG_EESCAPE);
716 c = BACKSL | GETNEXT();
717 }
718 switch (c) {
719 case '.':
720 if (p->g->cflags®_NEWLINE)
721 nonnewline(p);
722 else
723 EMIT(OANY, 0);
724 break;
725 case '[':
726 p_bracket(p);
727 break;
728 case BACKSL|'<':
729 EMIT(OBOW, 0);
730 break;
731 case BACKSL|'>':
732 EMIT(OEOW, 0);
733 break;
734 case BACKSL|'{':
735 SETERROR(REG_BADRPT);
736 break;
737 case BACKSL|'(':
738 p->g->nsub++;
739 subno = p->g->nsub;
740 if (subno < NPAREN)
741 p->pbegin[subno] = HERE();
742 EMIT(OLPAREN, subno);
743 /* the MORE here is an error heuristic */
744 if (MORE() && !SEETWO('\\', ')'))
745 p_re(p, '\\', ')');
746 if (subno < NPAREN) {
747 p->pend[subno] = HERE();
748 assert(p->pend[subno] != 0);
749 }
750 EMIT(ORPAREN, subno);
751 (void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
752 break;
753 case BACKSL|')': /* should not get here -- must be user */
754 SETERROR(REG_EPAREN);
755 break;
756 case BACKSL|'1':
757 case BACKSL|'2':
758 case BACKSL|'3':
759 case BACKSL|'4':
760 case BACKSL|'5':
761 case BACKSL|'6':
762 case BACKSL|'7':
763 case BACKSL|'8':
764 case BACKSL|'9':
765 i = (c&~BACKSL) - '0';
766 assert(i < NPAREN);
767 if (p->pend[i] != 0) {
768 assert(i <= p->g->nsub);
769 EMIT(OBACK_, i);
770 assert(p->pbegin[i] != 0);
771 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
772 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
773 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
774 EMIT(O_BACK, i);
775 } else
776 SETERROR(REG_ESUBREG);
777 p->g->backrefs = 1;
778 break;
779 case '*':
780 /*
781 * Ordinary if used as the first character beyond BOL anchor of
782 * a (sub-)expression, counts as a bad repetition operator if it
783 * appears otherwise.
784 */
785 (void) REQUIRE(bc->nchain == 0, REG_BADRPT);
786 /* FALLTHROUGH */
787 default:
788 if (p->error != 0)
789 return (false); /* Definitely not $... */
790 p->next--;
791 wc = WGETNEXT();
792 ordinary(p, wc);
793 break;
794 }
795
796 if (EAT('*')) { /* implemented as +? */
797 /* this case does not require the (y|) trick, noKLUDGE */
798 INSERT(OPLUS_, pos);
799 ASTERN(O_PLUS, pos);
800 INSERT(OQUEST_, pos);
801 ASTERN(O_QUEST, pos);
802 } else if (EATTWO('\\', '{')) {
803 count = p_count(p);
804 if (EAT(',')) {
805 if (MORE() && isdigit((uch)PEEK())) {
806 count2 = p_count(p);
807 (void) REQUIRE(count <= count2, REG_BADBR);
808 } else /* single number with comma */
809 count2 = INFINITY;
810 } else /* just a single number */
811 count2 = count;
812 repeat(p, pos, count, count2);
813 if (!EATTWO('\\', '}')) { /* error heuristics */
814 while (MORE() && !SEETWO('\\', '}'))
815 NEXT();
816 (void) REQUIRE(MORE(), REG_EBRACE);
817 SETERROR(REG_BADBR);
818 }
819 } else if (c == '$') /* $ (but not \$) ends it */
820 return (true);
821
822 return (false);
823 }
824
825 /*
826 * p_count - parse a repetition count
827 */
828 static int /* the value */
p_count(struct parse * p)829 p_count(struct parse *p)
830 {
831 int count = 0;
832 int ndigits = 0;
833
834 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
835 count = count*10 + (GETNEXT() - '0');
836 ndigits++;
837 }
838
839 (void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
840 return (count);
841 }
842
843 /*
844 * p_bracket - parse a bracketed character list
845 */
846 static void
p_bracket(struct parse * p)847 p_bracket(struct parse *p)
848 {
849 cset *cs;
850 wint_t ch;
851
852 /* Dept of Truly Sickening Special-Case Kludges */
853 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
854 EMIT(OBOW, 0);
855 NEXTn(6);
856 return;
857 }
858 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
859 EMIT(OEOW, 0);
860 NEXTn(6);
861 return;
862 }
863
864 if ((cs = allocset(p)) == NULL)
865 return;
866
867 if (p->g->cflags®_ICASE)
868 cs->icase = 1;
869 if (EAT('^'))
870 cs->invert = 1;
871 if (EAT(']'))
872 CHadd(p, cs, ']');
873 else if (EAT('-'))
874 CHadd(p, cs, '-');
875 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
876 p_b_term(p, cs);
877 if (EAT('-'))
878 CHadd(p, cs, '-');
879 (void) MUSTEAT(']', REG_EBRACK);
880
881 if (p->error != 0) /* don't mess things up further */
882 return;
883
884 if (cs->invert && p->g->cflags®_NEWLINE)
885 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
886
887 if ((ch = singleton(p, cs)) != OUT) { /* optimize singleton sets */
888 ordinary(p, ch);
889 freeset(p, cs);
890 } else
891 EMIT(OANYOF, (int)(cs - p->g->sets));
892 }
893
894 /*
895 * p_b_term - parse one term of a bracketed character list
896 */
897 static void
p_b_term(struct parse * p,cset * cs)898 p_b_term(struct parse *p, cset *cs)
899 {
900 char c;
901 wint_t start, finish;
902 wint_t i;
903 locale_t loc = uselocale(NULL);
904
905 /* classify what we've got */
906 switch ((MORE()) ? PEEK() : '\0') {
907 case '[':
908 c = (MORE2()) ? PEEK2() : '\0';
909 break;
910 case '-':
911 SETERROR(REG_ERANGE);
912 return; /* NOTE RETURN */
913 default:
914 c = '\0';
915 break;
916 }
917
918 switch (c) {
919 case ':': /* character class */
920 NEXT2();
921 (void) REQUIRE(MORE(), REG_EBRACK);
922 c = PEEK();
923 (void) REQUIRE(c != '-' && c != ']', REG_ECTYPE);
924 p_b_cclass(p, cs);
925 (void) REQUIRE(MORE(), REG_EBRACK);
926 (void) REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
927 break;
928 case '=': /* equivalence class */
929 NEXT2();
930 (void) REQUIRE(MORE(), REG_EBRACK);
931 c = PEEK();
932 (void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
933 p_b_eclass(p, cs);
934 (void) REQUIRE(MORE(), REG_EBRACK);
935 (void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
936 break;
937 default: /* symbol, ordinary character, or range */
938 start = p_b_symbol(p);
939 if (SEE('-') && MORE2() && PEEK2() != ']') {
940 /* range */
941 NEXT();
942 if (EAT('-'))
943 finish = '-';
944 else
945 finish = p_b_symbol(p);
946 } else
947 finish = start;
948 if (start == finish)
949 CHadd(p, cs, start);
950 else {
951 if (loc->collate->lc_is_posix) {
952 (void) REQUIRE((uch)start <= (uch)finish,
953 REG_ERANGE);
954 CHaddrange(p, cs, start, finish);
955 } else {
956 (void) REQUIRE(_collate_range_cmp(start,
957 finish, loc) <= 0, REG_ERANGE);
958 for (i = 0; i <= UCHAR_MAX; i++) {
959 if (_collate_range_cmp(start, i, loc)
960 <= 0 &&
961 _collate_range_cmp(i, finish, loc)
962 <= 0)
963 CHadd(p, cs, i);
964 }
965 }
966 }
967 break;
968 }
969 }
970
971 /*
972 * p_b_cclass - parse a character-class name and deal with it
973 */
974 static void
p_b_cclass(struct parse * p,cset * cs)975 p_b_cclass(struct parse *p, cset *cs)
976 {
977 const char *sp = p->next;
978 size_t len;
979 wctype_t wct;
980 char clname[16];
981
982 while (MORE() && isalpha((uch)PEEK()))
983 NEXT();
984 len = p->next - sp;
985 if (len >= sizeof (clname) - 1) {
986 SETERROR(REG_ECTYPE);
987 return;
988 }
989 (void) memcpy(clname, sp, len);
990 clname[len] = '\0';
991 if ((wct = wctype(clname)) == 0) {
992 SETERROR(REG_ECTYPE);
993 return;
994 }
995 CHaddtype(p, cs, wct);
996 }
997
998 /*
999 * p_b_eclass - parse an equivalence-class name and deal with it
1000 *
1001 * This implementation is incomplete. xxx
1002 */
1003 static void
p_b_eclass(struct parse * p,cset * cs)1004 p_b_eclass(struct parse *p, cset *cs)
1005 {
1006 wint_t c;
1007
1008 c = p_b_coll_elem(p, '=');
1009 CHadd(p, cs, c);
1010 }
1011
1012 /*
1013 * p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1014 */
1015 static wint_t /* value of symbol */
p_b_symbol(struct parse * p)1016 p_b_symbol(struct parse *p)
1017 {
1018 wint_t value;
1019
1020 (void) REQUIRE(MORE(), REG_EBRACK);
1021 if (!EATTWO('[', '.'))
1022 return (WGETNEXT());
1023
1024 /* collating symbol */
1025 value = p_b_coll_elem(p, '.');
1026 (void) REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1027 return (value);
1028 }
1029
1030 /*
1031 * p_b_coll_elem - parse a collating-element name and look it up
1032 */
1033 static wint_t /* value of collating element */
p_b_coll_elem(struct parse * p,wint_t endc)1034 p_b_coll_elem(struct parse *p,
1035 wint_t endc) /* name ended by endc,']' */
1036 {
1037 const char *sp = p->next;
1038 struct cname *cp;
1039 mbstate_t mbs;
1040 wchar_t wc;
1041 size_t clen, len;
1042
1043 while (MORE() && !SEETWO(endc, ']'))
1044 NEXT();
1045 if (!MORE()) {
1046 SETERROR(REG_EBRACK);
1047 return (0);
1048 }
1049 len = p->next - sp;
1050 for (cp = cnames; cp->name != NULL; cp++)
1051 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1052 return (cp->code); /* known name */
1053 (void) memset(&mbs, 0, sizeof (mbs));
1054 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1055 return (wc); /* single character */
1056 else if (clen == (size_t)-1 || clen == (size_t)-2)
1057 SETERROR(REG_ECHAR);
1058 else
1059 SETERROR(REG_ECOLLATE); /* neither */
1060 return (0);
1061 }
1062
1063 /*
1064 * othercase - return the case counterpart of an alphabetic
1065 */
1066 static wint_t /* if no counterpart, return ch */
othercase(wint_t ch)1067 othercase(wint_t ch)
1068 {
1069 assert(iswalpha(ch));
1070 if (iswupper(ch))
1071 return (towlower(ch));
1072 else if (iswlower(ch))
1073 return (towupper(ch));
1074 else /* peculiar, but could happen */
1075 return (ch);
1076 }
1077
1078 /*
1079 * bothcases - emit a dualcase version of a two-case character
1080 *
1081 * Boy, is this implementation ever a kludge...
1082 */
1083 static void
bothcases(struct parse * p,wint_t ch)1084 bothcases(struct parse *p, wint_t ch)
1085 {
1086 const char *oldnext = p->next;
1087 const char *oldend = p->end;
1088 char bracket[3 + MB_LEN_MAX];
1089 size_t n;
1090 mbstate_t mbs;
1091
1092 assert(othercase(ch) != ch); /* p_bracket() would recurse */
1093 p->next = bracket;
1094 (void) memset(&mbs, 0, sizeof (mbs));
1095 n = wcrtomb(bracket, ch, &mbs);
1096 assert(n != (size_t)-1);
1097 bracket[n] = ']';
1098 bracket[n + 1] = '\0';
1099 p->end = bracket+n+1;
1100 p_bracket(p);
1101 assert(p->next == p->end);
1102 p->next = oldnext;
1103 p->end = oldend;
1104 }
1105
1106 /*
1107 * ordinary - emit an ordinary character
1108 */
1109 static void
ordinary(struct parse * p,wint_t ch)1110 ordinary(struct parse *p, wint_t ch)
1111 {
1112 cset *cs;
1113
1114 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1115 bothcases(p, ch);
1116 else if ((ch & OPDMASK) == ch)
1117 EMIT(OCHAR, ch);
1118 else {
1119 /*
1120 * Kludge: character is too big to fit into an OCHAR operand.
1121 * Emit a singleton set.
1122 */
1123 if ((cs = allocset(p)) == NULL)
1124 return;
1125 CHadd(p, cs, ch);
1126 EMIT(OANYOF, (int)(cs - p->g->sets));
1127 }
1128 }
1129
1130 /*
1131 * nonnewline - emit REG_NEWLINE version of OANY
1132 *
1133 * Boy, is this implementation ever a kludge...
1134 */
1135 static void
nonnewline(struct parse * p)1136 nonnewline(struct parse *p)
1137 {
1138 const char *oldnext = p->next;
1139 const char *oldend = p->end;
1140 char bracket[4];
1141
1142 p->next = bracket;
1143 p->end = bracket+3;
1144 bracket[0] = '^';
1145 bracket[1] = '\n';
1146 bracket[2] = ']';
1147 bracket[3] = '\0';
1148 p_bracket(p);
1149 assert(p->next == bracket+3);
1150 p->next = oldnext;
1151 p->end = oldend;
1152 }
1153
1154 /*
1155 * repeat - generate code for a bounded repetition, recursively if needed
1156 */
1157 static void
repeat(struct parse * p,sopno start,int from,int to)1158 repeat(struct parse *p,
1159 sopno start, /* operand from here to end of strip */
1160 int from, /* repeated from this number */
1161 int to) /* to this number of times (maybe INFINITY) */
1162 {
1163 sopno finish = HERE();
1164 #define N 2
1165 #define INF 3
1166 #define REP(f, t) ((f)*8 + (t))
1167 #define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1168 sopno copy;
1169
1170 if (p->error != 0) /* head off possible runaway recursion */
1171 return;
1172
1173 assert(from <= to);
1174
1175 switch (REP(MAP(from), MAP(to))) {
1176 case REP(0, 0): /* must be user doing this */
1177 DROP(finish-start); /* drop the operand */
1178 break;
1179 case REP(0, 1): /* as x{1,1}? */
1180 case REP(0, N): /* as x{1,n}? */
1181 case REP(0, INF): /* as x{1,}? */
1182 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1183 INSERT(OCH_, start); /* offset is wrong... */
1184 repeat(p, start+1, 1, to);
1185 ASTERN(OOR1, start);
1186 AHEAD(start); /* ... fix it */
1187 EMIT(OOR2, 0);
1188 AHEAD(THERE());
1189 ASTERN(O_CH, THERETHERE());
1190 break;
1191 case REP(1, 1): /* trivial case */
1192 /* done */
1193 break;
1194 case REP(1, N): /* as x?x{1,n-1} */
1195 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1196 INSERT(OCH_, start);
1197 ASTERN(OOR1, start);
1198 AHEAD(start);
1199 EMIT(OOR2, 0); /* offset very wrong... */
1200 AHEAD(THERE()); /* ...so fix it */
1201 ASTERN(O_CH, THERETHERE());
1202 copy = dupl(p, start+1, finish+1);
1203 assert(copy == finish+4);
1204 repeat(p, copy, 1, to-1);
1205 break;
1206 case REP(1, INF): /* as x+ */
1207 INSERT(OPLUS_, start);
1208 ASTERN(O_PLUS, start);
1209 break;
1210 case REP(N, N): /* as xx{m-1,n-1} */
1211 copy = dupl(p, start, finish);
1212 repeat(p, copy, from-1, to-1);
1213 break;
1214 case REP(N, INF): /* as xx{n-1,INF} */
1215 copy = dupl(p, start, finish);
1216 repeat(p, copy, from-1, to);
1217 break;
1218 default: /* "can't happen" */
1219 SETERROR(REG_EFATAL); /* just in case */
1220 break;
1221 }
1222 }
1223
1224 /*
1225 * wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1226 * character from the parse struct, signals a REG_ILLSEQ error if the
1227 * character can't be converted. Returns the number of bytes consumed.
1228 */
1229 static wint_t
wgetnext(struct parse * p)1230 wgetnext(struct parse *p)
1231 {
1232 mbstate_t mbs;
1233 wchar_t wc;
1234 size_t n;
1235
1236 (void) memset(&mbs, 0, sizeof (mbs));
1237 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1238 if (n == (size_t)-1 || n == (size_t)-2) {
1239 SETERROR(REG_ECHAR);
1240 return (0);
1241 }
1242 if (n == 0)
1243 n = 1;
1244 p->next += n;
1245 return (wc);
1246 }
1247
1248 /*
1249 * seterr - set an error condition
1250 */
1251 static int /* useless but makes type checking happy */
seterr(struct parse * p,int e)1252 seterr(struct parse *p, int e)
1253 {
1254 if (p->error == 0) /* keep earliest error condition */
1255 p->error = e;
1256 p->next = nuls; /* try to bring things to a halt */
1257 p->end = nuls;
1258 return (0); /* make the return value well-defined */
1259 }
1260
1261 /*
1262 * allocset - allocate a set of characters for []
1263 */
1264 static cset *
allocset(struct parse * p)1265 allocset(struct parse *p)
1266 {
1267 cset *cs, *ncs;
1268
1269 ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof (*ncs));
1270 if (ncs == NULL) {
1271 SETERROR(REG_ESPACE);
1272 return (NULL);
1273 }
1274 p->g->sets = ncs;
1275 cs = &p->g->sets[p->g->ncsets++];
1276 (void) memset(cs, 0, sizeof (*cs));
1277
1278 return (cs);
1279 }
1280
1281 /*
1282 * freeset - free a now-unused set
1283 */
1284 static void
freeset(struct parse * p,cset * cs)1285 freeset(struct parse *p, cset *cs)
1286 {
1287 cset *top = &p->g->sets[p->g->ncsets];
1288
1289 free(cs->wides);
1290 free(cs->ranges);
1291 free(cs->types);
1292 (void) memset(cs, 0, sizeof (*cs));
1293 if (cs == top-1) /* recover only the easy case */
1294 p->g->ncsets--;
1295 }
1296
1297 /*
1298 * singleton - Determine whether a set contains only one character,
1299 * returning it if so, otherwise returning OUT.
1300 */
1301 static wint_t
singleton(struct parse * p,cset * cs)1302 singleton(struct parse *p, cset *cs)
1303 {
1304 wint_t i, s, n;
1305
1306 /* Exclude the complicated cases we don't want to deal with */
1307 if (cs->nranges != 0 || cs->ntypes != 0 || cs->icase != 0)
1308 return (OUT);
1309
1310 if (cs->nwides > 1)
1311 return (OUT);
1312
1313 /* Count the number of characters present in the bitmap */
1314 for (i = n = 0; i < p->nc; i++)
1315 if (CHIN(p->nc, cs, i)) {
1316 n++;
1317 s = i;
1318 }
1319
1320 if (n > 1)
1321 return (OUT);
1322
1323 if (n == 1) {
1324 if (cs->nwides == 0)
1325 return (s);
1326 else
1327 return (OUT);
1328 }
1329 if (cs->nwides == 1)
1330 return (cs->wides[0]);
1331
1332 return (OUT);
1333 }
1334
1335 /*
1336 * CHadd - add character to character set.
1337 */
1338 static void
CHadd(struct parse * p,cset * cs,wint_t ch)1339 CHadd(struct parse *p, cset *cs, wint_t ch)
1340 {
1341 wint_t nch, *newwides;
1342 assert(ch >= 0);
1343 if (ch < p->nc)
1344 cs->bmp[ch >> 3] |= 1 << (ch & 7);
1345 else {
1346 newwides = realloc(cs->wides, (cs->nwides + 1) *
1347 sizeof (*cs->wides));
1348 if (newwides == NULL) {
1349 SETERROR(REG_ESPACE);
1350 return;
1351 }
1352 cs->wides = newwides;
1353 cs->wides[cs->nwides++] = ch;
1354 }
1355 if (cs->icase) {
1356 if ((nch = towlower(ch)) < p->nc)
1357 cs->bmp[nch >> 3] |= 1 << (nch & 7);
1358 if ((nch = towupper(ch)) < p->nc)
1359 cs->bmp[nch >> 3] |= 1 << (nch & 7);
1360 }
1361 }
1362
1363 /*
1364 * CHaddrange - add all characters in the range [min,max] to a character set.
1365 */
1366 static void
CHaddrange(struct parse * p,cset * cs,wint_t min,wint_t max)1367 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1368 {
1369 crange *newranges;
1370
1371 for (; min < p->nc && min <= max; min++)
1372 CHadd(p, cs, min);
1373 if (min >= max)
1374 return;
1375 newranges = realloc(cs->ranges, (cs->nranges + 1) *
1376 sizeof (*cs->ranges));
1377 if (newranges == NULL) {
1378 SETERROR(REG_ESPACE);
1379 return;
1380 }
1381 cs->ranges = newranges;
1382 cs->ranges[cs->nranges].min = min;
1383 cs->ranges[cs->nranges].max = max;
1384 cs->nranges++;
1385 }
1386
1387 /*
1388 * CHaddtype - add all characters of a certain type to a character set.
1389 */
1390 static void
CHaddtype(struct parse * p,cset * cs,wctype_t wct)1391 CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1392 {
1393 wint_t i;
1394 wctype_t *newtypes;
1395
1396 for (i = 0; i < p->nc; i++)
1397 if (iswctype(i, wct))
1398 CHadd(p, cs, i);
1399 newtypes = realloc(cs->types, (cs->ntypes + 1) *
1400 sizeof (*cs->types));
1401 if (newtypes == NULL) {
1402 SETERROR(REG_ESPACE);
1403 return;
1404 }
1405 cs->types = newtypes;
1406 cs->types[cs->ntypes++] = wct;
1407 }
1408
1409 /*
1410 * dupl - emit a duplicate of a bunch of sops
1411 */
1412 static sopno /* start of duplicate */
dupl(struct parse * p,sopno start,sopno finish)1413 dupl(struct parse *p,
1414 sopno start, /* from here */
1415 sopno finish) /* to this less one */
1416 {
1417 sopno ret = HERE();
1418 sopno len = finish - start;
1419
1420 assert(finish >= start);
1421 if (len == 0)
1422 return (ret);
1423 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1424 return (ret);
1425 assert(p->ssize >= p->slen + len);
1426 (void) memcpy((char *)(p->strip + p->slen),
1427 (char *)(p->strip + start), (size_t)len*sizeof (sop));
1428 p->slen += len;
1429 return (ret);
1430 }
1431
1432 /*
1433 * doemit - emit a strip operator
1434 *
1435 * It might seem better to implement this as a macro with a function as
1436 * hard-case backup, but it's just too big and messy unless there are
1437 * some changes to the data structures. Maybe later.
1438 */
1439 static void
doemit(struct parse * p,sop op,size_t opnd)1440 doemit(struct parse *p, sop op, size_t opnd)
1441 {
1442 /* avoid making error situations worse */
1443 if (p->error != 0)
1444 return;
1445
1446 /* deal with oversize operands ("can't happen", more or less) */
1447 assert(opnd < 1<<OPSHIFT);
1448
1449 /* deal with undersized strip */
1450 if (p->slen >= p->ssize)
1451 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */
1452 return;
1453
1454 /* finally, it's all reduced to the easy case */
1455 p->strip[p->slen++] = SOP(op, opnd);
1456 }
1457
1458 /*
1459 * doinsert - insert a sop into the strip
1460 */
1461 static void
doinsert(struct parse * p,sop op,size_t opnd,sopno pos)1462 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1463 {
1464 sopno sn;
1465 sop s;
1466 int i;
1467
1468 /* avoid making error situations worse */
1469 if (p->error != 0)
1470 return;
1471
1472 sn = HERE();
1473 EMIT(op, opnd); /* do checks, ensure space */
1474 assert(HERE() == sn+1);
1475 s = p->strip[sn];
1476
1477 /* adjust paren pointers */
1478 assert(pos > 0);
1479 for (i = 1; i < NPAREN; i++) {
1480 if (p->pbegin[i] >= pos) {
1481 p->pbegin[i]++;
1482 }
1483 if (p->pend[i] >= pos) {
1484 p->pend[i]++;
1485 }
1486 }
1487
1488 (void) memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1489 (HERE()-pos-1)*sizeof (sop));
1490 p->strip[pos] = s;
1491 }
1492
1493 /*
1494 * dofwd - complete a forward reference
1495 */
1496 static void
dofwd(struct parse * p,sopno pos,sop value)1497 dofwd(struct parse *p, sopno pos, sop value)
1498 {
1499 /* avoid making error situations worse */
1500 if (p->error != 0)
1501 return;
1502
1503 assert(value < 1<<OPSHIFT);
1504 p->strip[pos] = OP(p->strip[pos]) | value;
1505 }
1506
1507 /*
1508 * enlarge - enlarge the strip
1509 */
1510 static int
enlarge(struct parse * p,sopno size)1511 enlarge(struct parse *p, sopno size)
1512 {
1513 sop *sp;
1514
1515 if (p->ssize >= size)
1516 return (1);
1517
1518 sp = (sop *)realloc(p->strip, size*sizeof (sop));
1519 if (sp == NULL) {
1520 SETERROR(REG_ESPACE);
1521 return (0);
1522 }
1523 p->strip = sp;
1524 p->ssize = size;
1525 return (1);
1526 }
1527
1528 /*
1529 * stripsnug - compact the strip
1530 */
1531 static void
stripsnug(struct parse * p,struct re_guts * g)1532 stripsnug(struct parse *p, struct re_guts *g)
1533 {
1534 g->nstates = p->slen;
1535 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof (sop));
1536 if (g->strip == NULL) {
1537 SETERROR(REG_ESPACE);
1538 g->strip = p->strip;
1539 }
1540 }
1541
1542 /*
1543 * findmust - fill in must and mlen with longest mandatory literal string
1544 *
1545 * This algorithm could do fancy things like analyzing the operands of |
1546 * for common subsequences. Someday. This code is simple and finds most
1547 * of the interesting cases.
1548 *
1549 * Note that must and mlen got initialized during setup.
1550 */
1551 static void
findmust(struct parse * p,struct re_guts * g)1552 findmust(struct parse *p, struct re_guts *g)
1553 {
1554 sop *scan;
1555 sop *start = NULL;
1556 sop *newstart = NULL;
1557 sopno newlen;
1558 sop s;
1559 char *cp;
1560 int offset;
1561 char buf[MB_LEN_MAX];
1562 size_t clen;
1563 mbstate_t mbs;
1564 locale_t loc = uselocale(NULL);
1565
1566 /* avoid making error situations worse */
1567 if (p->error != 0)
1568 return;
1569
1570 /*
1571 * It's not generally safe to do a ``char'' substring search on
1572 * multibyte character strings, but it's safe for at least
1573 * UTF-8 (see RFC 3629).
1574 */
1575 if (g->mb_cur_max > 1 &&
1576 strcmp(loc->runelocale->__encoding, "UTF-8") != 0)
1577 return;
1578
1579 /* find the longest OCHAR sequence in strip */
1580 newlen = 0;
1581 offset = 0;
1582 g->moffset = 0;
1583 scan = g->strip + 1;
1584 do {
1585 s = *scan++;
1586 switch (OP(s)) {
1587 case OCHAR: /* sequence member */
1588 if (newlen == 0) { /* new sequence */
1589 (void) memset(&mbs, 0, sizeof (mbs));
1590 newstart = scan - 1;
1591 }
1592 clen = wcrtomb(buf, OPND(s), &mbs);
1593 if (clen == (size_t)-1)
1594 goto toohard;
1595 newlen += clen;
1596 break;
1597 case OPLUS_: /* things that don't break one */
1598 case OLPAREN:
1599 case ORPAREN:
1600 break;
1601 case OQUEST_: /* things that must be skipped */
1602 case OCH_:
1603 offset = altoffset(scan, offset);
1604 scan--;
1605 do {
1606 scan += OPND(s);
1607 s = *scan;
1608 /* assert() interferes w debug printouts */
1609 if (OP(s) != (sop)O_QUEST &&
1610 OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) {
1611 g->iflags |= BAD;
1612 return;
1613 }
1614 } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1615 /* FALLTHROUGH */
1616 case OBOW: /* things that break a sequence */
1617 case OEOW:
1618 case OBOL:
1619 case OEOL:
1620 case O_QUEST:
1621 case O_CH:
1622 case OEND:
1623 if (newlen > (sopno)g->mlen) { /* ends one */
1624 start = newstart;
1625 g->mlen = newlen;
1626 if (offset > -1) {
1627 g->moffset += offset;
1628 offset = newlen;
1629 } else
1630 g->moffset = offset;
1631 } else {
1632 if (offset > -1)
1633 offset += newlen;
1634 }
1635 newlen = 0;
1636 break;
1637 case OANY:
1638 if (newlen > (sopno)g->mlen) { /* ends one */
1639 start = newstart;
1640 g->mlen = newlen;
1641 if (offset > -1) {
1642 g->moffset += offset;
1643 offset = newlen;
1644 } else
1645 g->moffset = offset;
1646 } else {
1647 if (offset > -1)
1648 offset += newlen;
1649 }
1650 if (offset > -1)
1651 offset++;
1652 newlen = 0;
1653 break;
1654 case OANYOF: /* may or may not invalidate offset */
1655 /* First, everything as OANY */
1656 if (newlen > (sopno)g->mlen) { /* ends one */
1657 start = newstart;
1658 g->mlen = newlen;
1659 if (offset > -1) {
1660 g->moffset += offset;
1661 offset = newlen;
1662 } else
1663 g->moffset = offset;
1664 } else {
1665 if (offset > -1)
1666 offset += newlen;
1667 }
1668 if (offset > -1)
1669 offset++;
1670 newlen = 0;
1671 break;
1672 toohard:
1673 default:
1674 /*
1675 * Anything here makes it impossible or too hard
1676 * to calculate the offset -- so we give up;
1677 * save the last known good offset, in case the
1678 * must sequence doesn't occur later.
1679 */
1680 if (newlen > (sopno)g->mlen) { /* ends one */
1681 start = newstart;
1682 g->mlen = newlen;
1683 if (offset > -1)
1684 g->moffset += offset;
1685 else
1686 g->moffset = offset;
1687 }
1688 offset = -1;
1689 newlen = 0;
1690 break;
1691 }
1692 } while (OP(s) != OEND);
1693
1694 if (g->mlen == 0) { /* there isn't one */
1695 g->moffset = -1;
1696 return;
1697 }
1698
1699 /* turn it into a character string */
1700 g->must = malloc((size_t)g->mlen + 1);
1701 if (g->must == NULL) { /* argh; just forget it */
1702 g->mlen = 0;
1703 g->moffset = -1;
1704 return;
1705 }
1706 cp = g->must;
1707 scan = start;
1708 (void) memset(&mbs, 0, sizeof (mbs));
1709 while (cp < g->must + g->mlen) {
1710 while (OP(s = *scan++) != OCHAR)
1711 continue;
1712 clen = wcrtomb(cp, OPND(s), &mbs);
1713 assert(clen != (size_t)-1);
1714 cp += clen;
1715 }
1716 assert(cp == g->must + g->mlen);
1717 *cp++ = '\0'; /* just on general principles */
1718 }
1719
1720 /*
1721 * altoffset - choose biggest offset among multiple choices
1722 *
1723 * Compute, recursively if necessary, the largest offset among multiple
1724 * re paths.
1725 */
1726 static int
altoffset(sop * scan,int offset)1727 altoffset(sop *scan, int offset)
1728 {
1729 int largest;
1730 int try;
1731 sop s;
1732
1733 /* If we gave up already on offsets, return */
1734 if (offset == -1)
1735 return (-1);
1736
1737 largest = 0;
1738 try = 0;
1739 s = *scan++;
1740 while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) {
1741 switch (OP(s)) {
1742 case OOR1:
1743 if (try > largest)
1744 largest = try;
1745 try = 0;
1746 break;
1747 case OQUEST_:
1748 case OCH_:
1749 try = altoffset(scan, try);
1750 if (try == -1)
1751 return (-1);
1752 scan--;
1753 do {
1754 scan += OPND(s);
1755 s = *scan;
1756 if (OP(s) != (sop)O_QUEST &&
1757 OP(s) != (sop)O_CH && OP(s) != (sop)OOR2)
1758 return (-1);
1759 } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1760 /*
1761 * We must skip to the next position, or we'll
1762 * leave altoffset() too early.
1763 */
1764 scan++;
1765 break;
1766 case OANYOF:
1767 case OCHAR:
1768 case OANY:
1769 try++;
1770 /*FALLTHRU*/
1771 case OBOW:
1772 case OEOW:
1773 case OLPAREN:
1774 case ORPAREN:
1775 case OOR2:
1776 break;
1777 default:
1778 try = -1;
1779 break;
1780 }
1781 if (try == -1)
1782 return (-1);
1783 s = *scan++;
1784 }
1785
1786 if (try > largest)
1787 largest = try;
1788
1789 return (largest+offset);
1790 }
1791
1792 /*
1793 * computejumps - compute char jumps for BM scan
1794 *
1795 * This algorithm assumes g->must exists and is has size greater than
1796 * zero. It's based on the algorithm found on Computer Algorithms by
1797 * Sara Baase.
1798 *
1799 * A char jump is the number of characters one needs to jump based on
1800 * the value of the character from the text that was mismatched.
1801 */
1802 static void
computejumps(struct parse * p,struct re_guts * g)1803 computejumps(struct parse *p, struct re_guts *g)
1804 {
1805 int ch;
1806 int mindex;
1807
1808 /* Avoid making errors worse */
1809 if (p->error != 0)
1810 return;
1811
1812 g->charjump = (int *)malloc((NC_MAX + 1) * sizeof (int));
1813 if (g->charjump == NULL) /* Not a fatal error */
1814 return;
1815 /* Adjust for signed chars, if necessary */
1816 g->charjump = &g->charjump[-(CHAR_MIN)];
1817
1818 /*
1819 * If the character does not exist in the pattern, the jump
1820 * is equal to the number of characters in the pattern.
1821 */
1822 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1823 g->charjump[ch] = g->mlen;
1824
1825 /*
1826 * If the character does exist, compute the jump that would
1827 * take us to the last character in the pattern equal to it
1828 * (notice that we match right to left, so that last character
1829 * is the first one that would be matched).
1830 */
1831 for (mindex = 0; mindex < g->mlen; mindex++)
1832 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1833 }
1834
1835 /*
1836 * computematchjumps - compute match jumps for BM scan
1837 *
1838 * This algorithm assumes g->must exists and is has size greater than
1839 * zero. It's based on the algorithm found on Computer Algorithms by
1840 * Sara Baase.
1841 *
1842 * A match jump is the number of characters one needs to advance based
1843 * on the already-matched suffix.
1844 * Notice that all values here are minus (g->mlen-1), because of the way
1845 * the search algorithm works.
1846 */
1847 static void
computematchjumps(struct parse * p,struct re_guts * g)1848 computematchjumps(struct parse *p, struct re_guts *g)
1849 {
1850 int mindex; /* General "must" iterator */
1851 int suffix; /* Keeps track of matching suffix */
1852 int ssuffix; /* Keeps track of suffixes' suffix */
1853 int *pmatches;
1854 /*
1855 * pmatches[k] points to the next i
1856 * such that i+1...mlen is a substring
1857 * of k+1...k+mlen-i-1
1858 */
1859
1860 /* Avoid making errors worse */
1861 if (p->error != 0)
1862 return;
1863
1864 pmatches = (int *)malloc(g->mlen * sizeof (unsigned int));
1865 if (pmatches == NULL) {
1866 g->matchjump = NULL;
1867 return;
1868 }
1869
1870 g->matchjump = (int *)malloc(g->mlen * sizeof (unsigned int));
1871 if (g->matchjump == NULL) { /* Not a fatal error */
1872 free(pmatches);
1873 return;
1874 }
1875
1876 /* Set maximum possible jump for each character in the pattern */
1877 for (mindex = 0; mindex < g->mlen; mindex++)
1878 g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1879
1880 /* Compute pmatches[] */
1881 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1882 mindex--, suffix--) {
1883 pmatches[mindex] = suffix;
1884
1885 /*
1886 * If a mismatch is found, interrupting the substring,
1887 * compute the matchjump for that position. If no
1888 * mismatch is found, then a text substring mismatched
1889 * against the suffix will also mismatch against the
1890 * substring.
1891 */
1892 while (suffix < g->mlen && g->must[mindex] != g->must[suffix]) {
1893 g->matchjump[suffix] = MIN(g->matchjump[suffix],
1894 g->mlen - mindex - 1);
1895 suffix = pmatches[suffix];
1896 }
1897 }
1898
1899 /*
1900 * Compute the matchjump up to the last substring found to jump
1901 * to the beginning of the largest must pattern prefix matching
1902 * it's own suffix.
1903 */
1904 for (mindex = 0; mindex <= suffix; mindex++)
1905 g->matchjump[mindex] = MIN(g->matchjump[mindex],
1906 g->mlen + suffix - mindex);
1907
1908 ssuffix = pmatches[suffix];
1909 while (suffix < g->mlen) {
1910 while (suffix <= ssuffix && suffix < g->mlen) {
1911 g->matchjump[suffix] = MIN(g->matchjump[suffix],
1912 g->mlen + ssuffix - suffix);
1913 suffix++;
1914 }
1915 if (suffix < g->mlen)
1916 ssuffix = pmatches[ssuffix];
1917 }
1918
1919 free(pmatches);
1920 }
1921
1922 /*
1923 * pluscount - count + nesting
1924 */
1925 static sopno /* nesting depth */
pluscount(struct parse * p,struct re_guts * g)1926 pluscount(struct parse *p, struct re_guts *g)
1927 {
1928 sop *scan;
1929 sop s;
1930 sopno plusnest = 0;
1931 sopno maxnest = 0;
1932
1933 if (p->error != 0)
1934 return (0); /* there may not be an OEND */
1935
1936 scan = g->strip + 1;
1937 do {
1938 s = *scan++;
1939 switch (OP(s)) {
1940 case OPLUS_:
1941 plusnest++;
1942 break;
1943 case O_PLUS:
1944 if (plusnest > maxnest)
1945 maxnest = plusnest;
1946 plusnest--;
1947 break;
1948 }
1949 } while (OP(s) != OEND);
1950 if (plusnest != 0)
1951 g->iflags |= BAD;
1952 return (maxnest);
1953 }
1954