1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 1995-2003 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28 /*
29 * xcompile, xstep, xadvance - simulate compile(3g), step(3g), advance(3g)
30 * using regcomp(3c), regexec(3c) interfaces. This is an XCU4
31 * porting aid. switches out to libgen compile/step if collation
32 * table not present.
33 *
34 * Goal is to work with vi and sed/ed.
35 * Returns expbuf in dhl format (encoding of first two bytes).
36 * Note also that this is profoundly single threaded. You
37 * cannot call compile twice with two separate search strings
38 * because the second call will wipe out the earlier stored string.
39 * This must be fixed, plus a general cleanup should be performed
40 * if this is to be integrated into libc.
41 *
42 */
43
44 #include <stdio.h>
45 #include <widec.h>
46 #include <sys/types.h>
47 #include <regex.h>
48 #include <locale.h>
49 #include <stdlib.h>
50 #include <locale.h>
51 #include <string.h>
52 #include <unistd.h>
53 #include <regexpr.h>
54
/*
 * pseudo compile/step/advance global variables
 *
 * These are declared here and defined elsewhere; this file only reads
 * and updates them.
 */
extern int nbra;		/* # of subexpressions in last compiled RE */
extern char *locs;		/* for stopping excess recursion */
extern char *loc1;		/* 1st character which matched RE */
extern char *loc2;		/* char after last char in matched RE */
extern char *braslist[];	/* start of nbra subexp */
extern char *braelist[];	/* end of nbra subexp */
extern int regerrno;		/* compile/step/advance style error code */
extern int reglength;		/* set to regexc_size by a successful compile */

int regcomp_flags;	/* interface to specify cflags for regcomp */

void regex_comp_free(void *a);
static int dhl_step(const char *str, const char *ep);
static int dhl_advance(const char *str, const char *ep);
static int map_errnos(int);		/* Convert regcomp error */
static int dhl_doit(const char *, const regex_t *, const int flags);
static char *dhl_compile(const char *instr, char *ep, char *endbuf);
74 static char *dhl_compile(const char *instr, char *ep, char *endbuf);
75
/*
 * # of sub re's: NOTE: For now limit on bra list defined here
 * but fix is to add maxbra define to regex.h
 * One problem is that a bigger number is a performance hit since
 * regexec() has a slow initialization loop that goes around SEPSIZE times
 *
 * rm[0] receives the offsets of the whole match; the remaining entries
 * receive the offsets of the parenthesized subexpressions.
 */
#define	SEPSIZE 20
static regmatch_t rm[SEPSIZE];	/* ptr to list of RE matches */
84
/*
 * Structure to contain dl encoded first two bytes for vi, plus hold two
 * regex structures, one for advance and one for step.
 *
 * An image of this structure is what dhl_compile() copies into the
 * caller's expbuf, so the member order defines the expbuf layout and
 * must not be changed.  reg_comp is the single static working copy.
 */
static struct regex_comp {
	char r_head[2];		/* Header for DL encoding for vi */
	regex_t r_stp;		/* For use by step */
	regex_t r_adv;		/* For use by advance */
} reg_comp;
94
/*
 * global value for the size of a regex_comp structure:
 * dhl_compile() allocates this many bytes when it is given no buffer, and
 * rejects a caller-supplied buffer that cannot hold this many bytes.
 */
size_t regexc_size = sizeof (reg_comp);
99
100
/*
 * compile: public entry point simulating compile(3g).  All of the work is
 * delegated to dhl_compile() and its result is handed back untouched.
 */
char *
compile(const char *instr, char *expbuf, char *endbuf)
{
	char *result;

	result = dhl_compile(instr, expbuf, endbuf);
	return (result);
}
106
/*
 * step: public entry point simulating step(3g).  Forwards both arguments
 * to dhl_step() and returns whatever it reports.
 */
int
step(const char *instr, const char *expbuf)
{
	int matched;

	matched = dhl_step(instr, expbuf);
	return (matched);
}
112
/*
 * advance: public entry point simulating advance(3g).  Forwards both
 * arguments to dhl_advance() and returns whatever it reports.
 */
int
advance(const char *instr, const char *expbuf)
{
	int matched;

	matched = dhl_advance(instr, expbuf);
	return (matched);
}
118
119
120 /*
121 * the compile and step routines here simulate the old libgen routines of
122 * compile/step Re: regexpr(3GEN). in order to do this, we must assume
123 * that expbuf[] consists of the following format:
124 * 1) the first two bytes consist of a special encoding - see below.
125 * 2) the next part is a regex_t used by regexec()/regcomp() for step
126 * 3) the final part is a regex_t used by regexec()/regcomp() for advance
127 *
128 * the special encoding of the first two bytes is referenced throughout
129 * vi. apparently expbuf[0] is set to:
130 * = 0 upon initialization
131 * = 1 if the first char of the RE is a ^
132 * = 0 if the first char of the RE isn't a ^
133 * and expbuf[1-35+] = bitmap of the type of RE chars in the expression.
134 * this is apparently 0 if there's no RE.
135 * Here, we use expbuf[0] in a similar fashion; and expbuf[1] is non-zero
136 * if there's at least 1 RE in the string.
137 * I say "apparently" as the code to compile()/step() is poorly written.
138 */
139 static char *
dhl_compile(const char * instr,char * expbuf,char * endbuf)140 dhl_compile(const char *instr, /* the regular expression */
141 char *expbuf, /* where the compiled RE gets placed */
142 char *endbuf) /* ending addr of expbuf */
143 {
144 int rv;
145 int alloc = 0;
146 char adv_instr[4096]; /* PLENTY big temp buffer */
147 char *instrp; /* PLENTY big temp buffer */
148
149 if (*instr == '\0') {
150 regerrno = 41;
151 return (NULL);
152 }
153
154 /*
155 * Check values of expbuf and endbuf
156 */
157 if (expbuf == NULL) {
158 if ((expbuf = malloc(regexc_size)) == NULL) {
159 regerrno = 50;
160 return (NULL);
161 }
162 memset(®_comp, 0, regexc_size);
163 alloc = 1;
164 endbuf = expbuf + regexc_size;
165 } else { /* Check if enough memory was allocated */
166 if (expbuf + regexc_size > endbuf) {
167 regerrno = 50;
168 return (NULL);
169 }
170 memcpy(®_comp, expbuf, regexc_size);
171 }
172
173 /*
174 * Clear global flags
175 */
176 nbra = 0;
177 regerrno = 0;
178
179 /*
180 * Free any data being held for previous search strings
181 */
182 regex_comp_free(®_comp);
183
184 /*
185 * We call regcomp twice, once to get a regex_t for use by step()
186 * and then again with for use by advance()
187 */
188 if ((rv = regcomp(®_comp.r_stp, instr, regcomp_flags)) != 0) {
189 regerrno = map_errnos(rv); /* Convert regcomp error */
190 goto out;
191 }
192 /*
193 * To support advance, which assumes an implicit ^ to match at start
194 * of line we prepend a ^ to the pattern by copying to a temp buffer
195 */
196
197 if (instr[0] == '^')
198 instrp = (char *)instr; /* String already has leading ^ */
199 else {
200 adv_instr[0] = '^';
201 strncpy(&adv_instr[1], instr, 2048);
202 instrp = adv_instr;
203 }
204
205 if ((rv = regcomp(®_comp.r_adv, instrp, regcomp_flags)) != 0) {
206 regerrno = map_errnos(rv); /* Convert regcomp error */
207 goto out;
208 }
209
210 /*
211 * update global variables
212 */
213 nbra = (int)reg_comp.r_adv.re_nsub > 0 ?
214 (int)reg_comp.r_adv.re_nsub : 0;
215 regerrno = 0;
216
217 /*
218 * Set the header flags for use by vi
219 */
220 if (instr[0] == '^') /* if beginning of string, */
221 reg_comp.r_head[0] = 1; /* set special flag */
222 else
223 reg_comp.r_head[0] = 0; /* clear special flag */
224 /*
225 * note that for a single BRE, nbra will be 0 here.
226 * we're guaranteed that, at this point, a RE has been found.
227 */
228 reg_comp.r_head[1] = 1; /* set special flag */
229 /*
230 * Copy our reg_comp structure to expbuf
231 */
232 (void) memcpy(expbuf, (char *)®_comp, regexc_size);
233
234 out:
235 /*
236 * Return code from libgen regcomp with mods. Note weird return
237 * value - if space is malloc'd return pointer to start of space,
238 * if user provided their own space, return pointer to 1+last byte
239 * of that space.
240 */
241 if (regerrno != 0) {
242 if (alloc)
243 free(expbuf);
244 return (NULL);
245 }
246 reglength = regexc_size;
247
248 if (alloc)
249 return (expbuf);
250 else
251 return (expbuf + regexc_size);
252 }
253
254
255 /*
256 * dhl_step: step through a string until a RE match is found, or end of str
257 */
258 static int
dhl_step(const char * str,const char * ep)259 dhl_step(const char *str, /* characters to be checked for a match */
260 const char *ep) /* compiled RE from dhl_compile() */
261 {
262 /*
263 * Check if we're passed a null ep
264 */
265 if (ep == NULL) {
266 regerrno = 41; /* No remembered search string error */
267 return (0);
268 }
269 /*
270 * Call common routine with r_stp (step) structure
271 */
272 return (dhl_doit(str, &(((struct regex_comp *)ep)->r_stp),
273 ((locs != NULL) ? REG_NOTBOL : 0)));
274 }
275
276 /*
277 * dhl_advance: implement advance
278 */
279 static int
dhl_advance(const char * str,const char * ep)280 dhl_advance(const char *str, /* characters to be checked for a match */
281 const char *ep) /* compiled RE from dhl_compile() */
282 {
283 int rv;
284 /*
285 * Check if we're passed a null ep
286 */
287 if (ep == NULL) {
288 regerrno = 41; /* No remembered search string error */
289 return (0);
290 }
291 /*
292 * Call common routine with r_adv (advance) structure
293 */
294 rv = dhl_doit(str, &(((struct regex_comp *)ep)->r_adv), 0);
295 loc1 = NULL; /* Clear it per the compile man page */
296 return (rv);
297 }
298
299 /*
300 * dhl_doit - common code for step and advance
301 */
302 static int
dhl_doit(const char * str,const regex_t * rep,const int flags)303 dhl_doit(const char *str, /* characters to be checked for a match */
304 const regex_t *rep,
305 const int flags) /* flags to be passed to regexec directly */
306 {
307 int rv;
308 int i;
309 regmatch_t *prm; /* ptr to current regmatch_t */
310
311 /*
312 * Check if we're passed a null regex_t
313 */
314 if (rep == NULL) {
315 regerrno = 41; /* No remembered search string error */
316 return (0);
317 }
318
319 regerrno = 0;
320 prm = &rm[0];
321
322 if ((rv = regexec(rep, str, SEPSIZE, prm, flags)) != REG_OK) {
323 if (rv == REG_NOMATCH)
324 return (0);
325 regerrno = map_errnos(rv);
326 return (0);
327 }
328
329 loc1 = (char *)str + prm->rm_so;
330 loc2 = (char *)str + prm->rm_eo;
331
332 /*
333 * Now we need to fill up the bra lists with all of the sub re's
334 * Note we subtract nsub -1, and preincrement prm.
335 */
336 for (i = 0; i <= rep->re_nsub; i++) {
337 prm++; /* XXX inc past first subexp */
338 braslist[i] = (char *)str + prm->rm_so;
339 braelist[i] = (char *)str + prm->rm_eo;
340 if (i >= SEPSIZE) {
341 regerrno = 50; /* regex overflow */
342 return (0);
343 }
344 }
345
346 /*
347 * Inverse logic, a zero from regexec - success, is a 1
348 * from advance/step.
349 */
350
351 return (rv == 0);
352 }
353
354
355 /*
356 * regerrno to compile/step error mapping:
357 * This is really a big compromise. Some errors don't map at all
358 * like regcomp error 15 is generated by both compile() error types
359 * 44 & 46. So which one should we map to?
 * Note REG_ENSUB Can't happen- 9 is no longer max num of subexpressions
361 * To do your errors right use xregerr() to get the regcomp error
362 * string and print that.
363 *
364 * | regcomp/regexec | Compile/step/advance |
365 * +---------------------------------+--------------------------------------+
366 * 0 REG_OK Pattern matched 1 - Pattern matched
367 * 1 REG_NOMATCH No match 0 - Pattern didn't match
368 * 2 REG_ECOLLATE Bad collation elmnt. 67 - Returned by compile on mbtowc err
369 * 3 REG_EESCAPE trailing \ in patrn 45 - } expected after \.
370 * 4 REG_ENEWLINE \n before end pattrn 36 - Illegal or missing delimiter.
371 * 5 REG_ENSUB Over 9 \( \) pairs 43 - Too many \(
372 * 6 REG_ESUBREG Bad number in \[0-9] 25 - ``\digit'' out of range.
 * 7 REG_EBRACK [ ] imbalance 49 - [ ] imbalance.
 * 8 REG_EPAREN ( ) imbalance 42 - \(~\) imbalance.
 * 9 REG_EBRACE \{ \} imbalance 45 - } expected after \.
376 * 10 REG_ERANGE bad range endpoint 11 - Range endpoint too large.
377 * 11 REG_ESPACE no memory for pattern 50 - Regular expression overflow.
378 * 12 REG_BADRPT invalid repetition 36 - Illegal or missing delimiter.
379 * 13 REG_ECTYPE invalid char-class 67 - illegal byte sequence
380 * 14 REG_BADPAT syntax error 50 - Regular expression overflow.
381 * 15 REG_BADBR \{ \} contents bad 46 - First number exceeds 2nd in \{~\}
382 * 16 REG_EFATAL internal error 50 - Regular expression overflow.
 * 17 REG_ECHAR bad multibyte char 67 - illegal byte sequence
384 * 18 REG_STACK stack overflow 50 - Regular expression overflow.
385 * 19 REG_ENOSYS function not supported 50- Regular expression overflow.
386 *
387 * For reference here's the compile/step errno's. We don't generate
388 * 41 here - it's done earlier, nor 44 since we can't tell if from 46.
389 *
390 * 11 - Range endpoint too large.
391 * 16 - Bad number.
392 * 25 - ``\digit'' out of range.
393 * 36 - Illegal or missing delimiter.
394 * 41 - No remembered search string.
395 * 42 - \(~\) imbalance.
396 * 43 - Too many \(.
397 * 44 - More than 2 numbers given in "\{~\}"
398 * 45 - } expected after \.
399 * 46 - First number exceeds 2nd in "\{~\}"
400 * 49 - [ ] imbalance.
401 * 50 - Regular expression overflow.
402 */
403
404 static int
map_errnos(int Errno)405 map_errnos(int Errno)
406 {
407 switch (Errno) {
408 case REG_ECOLLATE:
409 regerrno = 67;
410 break;
411 case REG_EESCAPE:
412 regerrno = 45;
413 break;
414 case REG_ENEWLINE:
415 regerrno = 36;
416 break;
417 case REG_ENSUB:
418 regerrno = 43;
419 break;
420 case REG_ESUBREG:
421 regerrno = 25;
422 break;
423 case REG_EBRACK:
424 regerrno = 49;
425 break;
426 case REG_EPAREN:
427 regerrno = 42;
428 break;
429 case REG_EBRACE:
430 regerrno = 45;
431 break;
432 case REG_ERANGE:
433 regerrno = 11;
434 break;
435 case REG_ESPACE:
436 regerrno = 50;
437 break;
438 case REG_BADRPT:
439 regerrno = 36;
440 break;
441 case REG_ECTYPE:
442 regerrno = 67;
443 break;
444 case REG_BADPAT:
445 regerrno = 50;
446 break;
447 case REG_BADBR:
448 regerrno = 46;
449 break;
450 case REG_EFATAL:
451 regerrno = 50;
452 break;
453 case REG_ECHAR:
454 regerrno = 67;
455 break;
456 case REG_STACK:
457 regerrno = 50;
458 break;
459 case REG_ENOSYS:
460 regerrno = 50;
461 break;
462 default:
463 regerrno = 50;
464 break;
465 }
466 return (regerrno);
467 }
468
469 /*
470 * This is a routine to clean up the subtle substructure of the struct
471 * regex_comp type for use by clients of this module. Since the struct
472 * type is private, we use a generic interface, and trust the
473 * application to be damn sure that this operation is valid for the
474 * named memory.
475 */
476
477 void
regex_comp_free(void * a)478 regex_comp_free(void *a)
479 {
480 /*
481 * Free any data being held for previous search strings
482 */
483
484 if (a == NULL) {
485 return;
486 }
487
488 regfree(&((struct regex_comp *)a)->r_stp);
489 regfree(&((struct regex_comp *)a)->r_adv);
490 }
491