xref: /illumos-gate/usr/src/cmd/expr/compile.c (revision 2e837a72011f54762249b6612c2a64f171efcd43)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 1995-2003 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright (c) 2016 by Delphix. All rights reserved.
26  */
27 
28 /*
29  * xcompile, xstep, xadvance - simulate compile(3g), step(3g), advance(3g)
30  *	using regcomp(3c), regexec(3c) interfaces. This is an XCU4
31  *	porting aid. switches out to libgen compile/step if collation
32  *	table not present.
33  *
34  *	Goal is to work with vi and sed/ed.
35  *	Returns expbuf in dhl format (encoding of first two bytes).
36  *	Note also that this is profoundly single threaded.  You
37  *	cannot call compile twice with two separate search strings
38  *	because the second call will wipe out the earlier stored string.
39  *	This must be fixed, plus a general cleanup should be performed
40  *	if this is to be integrated into libc.
41  *
42  */
43 
44 #include <stdio.h>
45 #include <widec.h>
46 #include <sys/types.h>
47 #include <regex.h>
48 #include <locale.h>
49 #include <stdlib.h>
50 #include <locale.h>
51 #include <string.h>
52 #include <unistd.h>
53 #include <regexpr.h>
54 
/*
 * Pseudo compile/step/advance global variables.  Everything declared
 * extern below is only referenced from this file; regcomp_flags is the
 * one object this file defines itself.
 */
extern int nbra;		/* set by compile: re_nsub of the last RE */
extern char *locs;		/* for stopping excess recursion */
extern char *loc1;		/* first character which matched the RE */
extern char *loc2;		/* character after last char in matched RE */
extern char *braslist[];	/* start of each of the nbra subexps */
extern char *braelist[];	/* end of each of the nbra subexps */
extern int regerrno;		/* compile/step/advance error code */
extern int reglength;		/* set by compile to regexc_size */

int regcomp_flags;		/* interface to specify cflags for regcomp */
68 
/* Forward declarations, listed in the order the definitions appear. */
static char *dhl_compile(const char *instr, char *expbuf, char *endbuf);
static int dhl_step(const char *str, const char *ep);
static int dhl_advance(const char *str, const char *ep);
static int dhl_doit(const char *str, const regex_t *rep, int flags);
static int map_errnos(int Errno);	/* Convert regcomp error */
void regex_comp_free(void *a);
75 
/*
 * Number of regmatch_t slots handed to regexec(), i.e. the whole match
 * plus the sub-REs.  NOTE: for now the limit on the bra list is defined
 * here; the real fix is to add a maxbra define to regex.h.
 * One problem is that a bigger number is a performance hit, since
 * regexec() has a slow initialization loop that goes around SEPSIZE times.
 */
#define	SEPSIZE 20
static regmatch_t rm[SEPSIZE];		/* match list filled in by regexec() */
84 
/*
 * Image that compile() stores in the caller's expbuf: the two header
 * bytes of the dl encoding that vi looks at, followed by two regex
 * structures, one for step and one for advance.  reg_comp is the working
 * copy that dhl_compile() fills in before copying it out.
 */
struct regex_comp {
	char	r_head[2];		/* Header for DL encoding for vi */
	regex_t	r_stp;			/* For use by step */
	regex_t	r_adv;			/* For use by advance */
};

static struct regex_comp reg_comp;
94 
95 /*
96  * global value for the size of a regex_comp structure:
97  */
98 size_t regexc_size = sizeof (reg_comp);
99 
100 
/*
 * compile: public entry point; passes its arguments unchanged to
 * dhl_compile() and returns whatever that returns.
 */
char *
compile(const char *instr, char *expbuf, char *endbuf)
{
	char *compiled;

	compiled = dhl_compile(instr, expbuf, endbuf);
	return (compiled);
}
106 
/*
 * step: public entry point; passes its arguments unchanged to dhl_step()
 * and returns whatever that returns.
 */
int
step(const char *instr, const char *expbuf)
{
	int matched;

	matched = dhl_step(instr, expbuf);
	return (matched);
}
112 
/*
 * advance: public entry point; passes its arguments unchanged to
 * dhl_advance() and returns whatever that returns.
 */
int
advance(const char *instr, const char *expbuf)
{
	int matched;

	matched = dhl_advance(instr, expbuf);
	return (matched);
}
118 
119 
120 /*
121  * the compile and step routines here simulate the old libgen routines of
122  * compile/step Re: regexpr(3G). in order to do this, we must assume
123  * that expbuf[] consists of the following format:
124  *	1) the first two bytes consist of a special encoding - see below.
125  *	2) the next part is a regex_t used by regexec()/regcomp() for step
126  *	3) the final part is a regex_t used by regexec()/regcomp() for advance
127  *
128  * the special encoding of the first two bytes is referenced throughout
129  * vi. apparently expbuf[0] is set to:
130  *	= 0 upon initialization
131  *	= 1 if the first char of the RE is a ^
132  *	= 0 if the first char of the RE isn't a ^
133  * and expbuf[1-35+]	= bitmap of the type of RE chars in the expression.
134  * this is apparently 0 if there's no RE.
135  * Here, we use expbuf[0] in a similar fashion; and expbuf[1] is non-zero
136  * if there's at least 1 RE in the string.
137  * I say "apparently" as the code to compile()/step() is poorly written.
138  */
139 static char *
140 dhl_compile(const char *instr,	/* the regular expression		*/
141     char *expbuf,		/* where the compiled RE gets placed	*/
142     char *endbuf)		/* ending addr of expbuf		*/
143 {
144 	int rv;
145 	int alloc = 0;
146 	char adv_instr[4096];	/* PLENTY big temp buffer */
147 	char *instrp;		/* PLENTY big temp buffer */
148 
149 	if (*instr == '\0') {
150 		regerrno = 41;
151 		return (NULL);
152 	}
153 
154 	/*
155 	 * Check values of expbuf and endbuf
156 	 */
157 	if (expbuf == NULL) {
158 		if ((expbuf = malloc(regexc_size)) == NULL) {
159 			regerrno = 50;
160 			return (NULL);
161 		}
162 		memset(&reg_comp, 0, regexc_size);
163 		alloc = 1;
164 		endbuf = expbuf + regexc_size;
165 	} else {		/* Check if enough memory was allocated */
166 		if (expbuf + regexc_size > endbuf) {
167 			regerrno = 50;
168 			return (NULL);
169 		}
170 		memcpy(&reg_comp, expbuf, regexc_size);
171 	}
172 
173 	/*
174 	 * Clear global flags
175 	 */
176 	nbra = 0;
177 	regerrno = 0;
178 
179 	/*
180 	 * Free any data being held for previous search strings
181 	 */
182 	regex_comp_free(&reg_comp);
183 
184 	/*
185 	 * We call regcomp twice, once to get a regex_t for use by step()
186 	 * and then again with for use by advance()
187 	 */
188 	if ((rv = regcomp(&reg_comp.r_stp, instr, regcomp_flags)) != 0) {
189 		regerrno = map_errnos(rv);	/* Convert regcomp error */
190 		goto out;
191 	}
192 	/*
193 	 * To support advance, which assumes an implicit ^ to match at start
194 	 * of line we prepend a ^ to the pattern by copying to a temp buffer
195 	 */
196 
197 	if (instr[0] == '^')
198 		instrp = (char *)instr; /* String already has leading ^ */
199 	else {
200 		adv_instr[0] = '^';
201 		strncpy(&adv_instr[1], instr, 2048);
202 		instrp = adv_instr;
203 	}
204 
205 	if ((rv = regcomp(&reg_comp.r_adv, instrp, regcomp_flags)) != 0) {
206 		regerrno = map_errnos(rv);	/* Convert regcomp error */
207 		goto out;
208 	}
209 
210 	/*
211 	 * update global variables
212 	 */
213 	nbra = (int)reg_comp.r_adv.re_nsub > 0 ?
214 	    (int)reg_comp.r_adv.re_nsub : 0;
215 	regerrno = 0;
216 
217 	/*
218 	 * Set the header flags for use by vi
219 	 */
220 	if (instr[0] == '^')		/* if beginning of string,	*/
221 		reg_comp.r_head[0] = 1;	/* set special flag		*/
222 	else
223 		reg_comp.r_head[0] = 0;	/* clear special flag		*/
224 	/*
225 	 * note that for a single BRE, nbra will be 0 here.
226 	 * we're guaranteed that, at this point, a RE has been found.
227 	 */
228 	reg_comp.r_head[1] = 1;	/* set special flag		*/
229 	/*
230 	 * Copy our reg_comp structure to expbuf
231 	 */
232 	(void) memcpy(expbuf, (char *)&reg_comp, regexc_size);
233 
234 out:
235 	/*
236 	 * Return code from libgen regcomp with mods.  Note weird return
237 	 * value - if space is malloc'd return pointer to start of space,
238 	 * if user provided their own space, return pointer to 1+last byte
239 	 * of that space.
240 	 */
241 	if (regerrno != 0) {
242 		if (alloc)
243 			free(expbuf);
244 		return (NULL);
245 	}
246 	reglength = regexc_size;
247 
248 	if (alloc)
249 		return (expbuf);
250 	else
251 		return (expbuf + regexc_size);
252 }
253 
254 
255 /*
256  * dhl_step: step through a string until a RE match is found, or end of str
257  */
258 static int
259 dhl_step(const char *str,	/* characters to be checked for a match	*/
260     const char *ep)		/* compiled RE from dhl_compile()	*/
261 {
262 	/*
263 	 * Check if we're passed a null ep
264 	 */
265 	if (ep == NULL) {
266 		regerrno = 41;	/* No remembered search string error */
267 		return (0);
268 	}
269 	/*
270 	 * Call common routine with r_stp (step) structure
271 	 */
272 	return (dhl_doit(str, &(((struct regex_comp *)ep)->r_stp),
273 	    ((locs != NULL) ? REG_NOTBOL : 0)));
274 }
275 
276 /*
277  * dhl_advance: implement advance
278  */
279 static int
280 dhl_advance(const char *str,	/* characters to be checked for a match	*/
281     const char *ep)		/* compiled RE from dhl_compile()	*/
282 {
283 	int rv;
284 	/*
285 	 * Check if we're passed a null ep
286 	 */
287 	if (ep == NULL) {
288 		regerrno = 41;	/* No remembered search string error */
289 		return (0);
290 	}
291 	/*
292 	 * Call common routine with r_adv (advance) structure
293 	 */
294 	rv = dhl_doit(str, &(((struct regex_comp *)ep)->r_adv), 0);
295 	loc1 = NULL;		/* Clear it per the compile man page */
296 	return (rv);
297 }
298 
299 /*
300  * dhl_doit - common code for step and advance
301  */
302 static int
303 dhl_doit(const char *str,	/* characters to be checked for a match	*/
304     const regex_t *rep,
305     const int flags)		/* flags to be passed to regexec directly */
306 {
307 	int rv;
308 	int i;
309 	regmatch_t *prm;	/* ptr to current regmatch_t		*/
310 
311 	/*
312 	 * Check if we're passed a null regex_t
313 	 */
314 	if (rep == NULL) {
315 		regerrno = 41;	/* No remembered search string error */
316 		return (0);
317 	}
318 
319 	regerrno = 0;
320 	prm = &rm[0];
321 
322 	if ((rv = regexec(rep, str, SEPSIZE, prm, flags)) != REG_OK) {
323 		if (rv == REG_NOMATCH)
324 			return (0);
325 		regerrno = map_errnos(rv);
326 		return (0);
327 	}
328 
329 	loc1 = (char *)str + prm->rm_so;
330 	loc2 = (char *)str + prm->rm_eo;
331 
332 	/*
333 	 * Now we need to fill up the bra lists with all of the sub re's
334 	 * Note we subtract nsub -1, and preincrement prm.
335 	 */
336 	for (i = 0; i <= rep->re_nsub; i++) {
337 		prm++;		/* XXX inc past first subexp */
338 		braslist[i] = (char *)str + prm->rm_so;
339 		braelist[i] = (char *)str + prm->rm_eo;
340 		if (i >= SEPSIZE) {
341 			regerrno = 50;	/* regex overflow */
342 			return (0);
343 		}
344 	}
345 
346 	/*
347 	 * Inverse logic, a zero from regexec - success, is a 1
348 	 * from advance/step.
349 	 */
350 
351 	return (rv == 0);
352 }
353 
354 
355 /*
356  *	regerrno to compile/step error mapping:
357  *	This is really a big compromise.  Some errors don't map at all
358  *	like regcomp error 15 is generated by both compile() error types
359  *	44 & 46.  So which one should we map to?
360  *	Note REG_ESUB Can't happen- 9 is no longer max num of subexpressions
361  *	To do your errors right use xregerr() to get the regcomp error
362  *	string and print that.
363  *
364  * |    regcomp/regexec              |  Compile/step/advance                |
365  * +---------------------------------+--------------------------------------+
366  * 0 REG_OK	  Pattern matched	1  - Pattern matched
367  * 1 REG_NOMATCH  No match		0  - Pattern didn't match
368  * 2 REG_ECOLLATE Bad collation elmnt.	67 - Returned by compile on mbtowc err
369  * 3 REG_EESCAPE  trailing \ in patrn	45 - } expected after \.
370  * 4 REG_ENEWLINE \n before end pattrn	36 - Illegal or missing delimiter.
371  * 5 REG_ENSUB    Over 9 \( \) pairs	43 - Too many \(
372  * 6 REG_ESUBREG  Bad number in \[0-9]  25 - ``\digit'' out of range.
373  * 7 REG_EBRACK   [ ] inbalance		49 - [ ] imbalance.
374  * 8 REG_EPAREN   ( ) inbalance         42 - \(~\) imbalance.
375  * 9 REG_EBRACE   \{ \} inbalance       45 - } expected after \.
376  * 10 REG_ERANGE  bad range endpoint	11 - Range endpoint too large.
377  * 11 REG_ESPACE  no memory for pattern 50 - Regular expression overflow.
378  * 12 REG_BADRPT  invalid repetition	36 - Illegal or missing delimiter.
379  * 13 REG_ECTYPE  invalid char-class    67 - illegal byte sequence
380  * 14 REG_BADPAT  syntax error		50 - Regular expression overflow.
381  * 15 REG_BADBR   \{ \} contents bad	46 - First number exceeds 2nd in \{~\}
382  * 16 REG_EFATAL  internal error	50 - Regular expression overflow.
383  * 17 REG_ECHAR   bad mulitbyte char	67 - illegal byte sequence
384  * 18 REG_STACK   stack overflow	50 - Regular expression overflow.
385  * 19 REG_ENOSYS  function not supported 50- Regular expression overflow.
386  *
387  *	For reference here's the compile/step errno's. We don't generate
388  *	41 here - it's done earlier, nor 44 since we can't tell if from 46.
389  *
390  *	11 - Range endpoint too large.
391  *	16 - Bad number.
392  *	25 - ``\digit'' out of range.
393  *	36 - Illegal or missing delimiter.
394  *	41 - No remembered search string.
395  *	42 - \(~\) imbalance.
396  *	43 - Too many \(.
397  *	44 - More than 2 numbers given in "\{~\}"
398  *	45 - } expected after \.
399  *	46 - First number exceeds 2nd in "\{~\}"
400  *	49 - [ ] imbalance.
401  *	50 - Regular expression overflow.
402  */
403 
404 static int
405 map_errnos(int Errno)
406 {
407 	switch (Errno) {
408 	case REG_ECOLLATE:
409 		regerrno = 67;
410 		break;
411 	case REG_EESCAPE:
412 		regerrno = 45;
413 		break;
414 	case REG_ENEWLINE:
415 		regerrno = 36;
416 		break;
417 	case REG_ENSUB:
418 		regerrno = 43;
419 		break;
420 	case REG_ESUBREG:
421 		regerrno = 25;
422 		break;
423 	case REG_EBRACK:
424 		regerrno = 49;
425 		break;
426 	case REG_EPAREN:
427 		regerrno = 42;
428 		break;
429 	case REG_EBRACE:
430 		regerrno = 45;
431 		break;
432 	case REG_ERANGE:
433 		regerrno = 11;
434 		break;
435 	case REG_ESPACE:
436 		regerrno = 50;
437 		break;
438 	case REG_BADRPT:
439 		regerrno = 36;
440 		break;
441 	case REG_ECTYPE:
442 		regerrno = 67;
443 		break;
444 	case REG_BADPAT:
445 		regerrno = 50;
446 		break;
447 	case REG_BADBR:
448 		regerrno = 46;
449 		break;
450 	case REG_EFATAL:
451 		regerrno = 50;
452 		break;
453 	case REG_ECHAR:
454 		regerrno = 67;
455 		break;
456 	case REG_STACK:
457 		regerrno = 50;
458 		break;
459 	case REG_ENOSYS:
460 		regerrno = 50;
461 		break;
462 	default:
463 		regerrno = 50;
464 		break;
465 	}
466 	return (regerrno);
467 }
468 
469 /*
470  *  This is a routine to clean up the subtle substructure of the struct
471  *  regex_comp type for use by clients of this module.  Since the struct
472  *  type is private, we use a generic interface, and trust the
473  *  application to be damn sure that this operation is valid for the
474  *  named memory.
475  */
476 
477 void
478 regex_comp_free(void *a)
479 {
480 	/*
481 	 * Free any data being held for previous search strings
482 	 */
483 
484 	if (a == NULL) {
485 		return;
486 	}
487 
488 	regfree(&((struct regex_comp *)a)->r_stp);
489 	regfree(&((struct regex_comp *)a)->r_adv);
490 }
491