xref: /illumos-gate/usr/src/cmd/expr/compile.c (revision e8921a52c53ee69f7b65f054d9b2e886139daa59)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 1995-2003 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright (c) 2016 by Delphix. All rights reserved.
26  */
27 
28 /*
29  * xcompile, xstep, xadvance - simulate compile(3g), step(3g), advance(3g)
30  *	using regcomp(3c), regexec(3c) interfaces. This is an XCU4
31  *	porting aid. switches out to libgen compile/step if collation
32  *	table not present.
33  *
34  *	Goal is to work with vi and sed/ed.
35  * 	Returns expbuf in dhl format (encoding of first two bytes).
36  * 	Note also that this is profoundly single threaded.  You
37  *	cannot call compile twice with two separate search strings
38  *	because the second call will wipe out the earlier stored string.
39  *	This must be fixed, plus a general cleanup should be performed
40  *	if this is to be integrated into libc.
41  *
42  */
43 
44 #pragma ident	"%Z%%M%	%I%	%E% SMI"
45 
46 #include <stdio.h>
47 #include <widec.h>
48 #include <sys/types.h>
49 #include <regex.h>
50 #include <locale.h>
51 #include <stdlib.h>
52 #include <locale.h>
53 #include <string.h>
54 #include <unistd.h>
55 #include <regexpr.h>
56 
/*
 * Pseudo compile/step/advance global variables.  All of the extern
 * objects are defined outside this file; this module only reads and
 * updates them.
 */
extern int nbra;		/* set by compile to the subexpression count */
extern int regerrno;		/* compile/step/advance error code */
extern int reglength;		/* set by compile to the compiled RE size */

extern char *locs;		/* for stopping excess recursion */
extern char *loc1;		/* first character which matched the RE */
extern char *loc2;		/* character after the last matched character */

extern char *braslist[];	/* start of each matched subexpression */
extern char *braelist[];	/* end of each matched subexpression */

/* Interface for callers to specify the cflags handed to regcomp(). */
int regcomp_flags;
70 
/* Exported helper: releases both regex_t members of a struct regex_comp. */
void regex_comp_free(void *a);

/* File-private helpers, in the order compile -> step/advance -> common. */
static char *dhl_compile(const char *instr, char *expbuf, char *endbuf);
static int dhl_step(const char *str, const char *ep);
static int dhl_advance(const char *str, const char *ep);
static int dhl_doit(const char *str, const regex_t *rep, const int flags);
static int map_errnos(int Errno);	/* convert a regcomp/regexec error */
77 
/*
 * Number of regmatch_t slots handed to regexec(): slot 0 is the whole
 * match, the remaining slots are subexpressions.  NOTE: for now the
 * limit on the bra list is defined here; the real fix is to add a
 * maxbra define to regex.h.  A bigger number is a performance hit since
 * regexec() has a slow initialization loop that goes around SEPSIZE
 * times.
 */
#define	SEPSIZE 20
static regmatch_t rm[SEPSIZE];		/* match offsets from last regexec */

/*
 * Layout of a compiled expression: the two DL-encoded header bytes that
 * vi looks at, followed by two regex_t structures, one used by step and
 * one used by advance.
 */
struct regex_comp {
	char	r_head[2];		/* Header for DL encoding for vi */
	regex_t	r_stp;			/* For use by step */
	regex_t	r_adv;			/* For use by advance */
};

/* Working copy used while compiling. */
static struct regex_comp reg_comp;

/*
 * Global value for the size of a regex_comp structure.
 */
size_t regexc_size = sizeof (struct regex_comp);
101 
102 
/*
 * compile: public entry point; hands its arguments unchanged to
 * dhl_compile() and returns whatever that returns.
 */
char *
compile(const char *instr, char *expbuf, char *endbuf)
{
	char *compiled;

	compiled = dhl_compile(instr, expbuf, endbuf);
	return (compiled);
}
108 
/*
 * step: public entry point; hands its arguments unchanged to dhl_step()
 * and returns whatever that returns.
 */
int
step(const char *instr, const char *expbuf)
{
	int matched;

	matched = dhl_step(instr, expbuf);
	return (matched);
}
114 
/*
 * advance: public entry point; hands its arguments unchanged to
 * dhl_advance() and returns whatever that returns.
 */
int
advance(const char *instr, const char *expbuf)
{
	int matched;

	matched = dhl_advance(instr, expbuf);
	return (matched);
}
120 
121 
122 /*
123  * the compile and step routines here simulate the old libgen routines of
124  * compile/step Re: regexpr(3G). in order to do this, we must assume
125  * that expbuf[] consists of the following format:
126  *	1) the first two bytes consist of a special encoding - see below.
127  *	2) the next part is a regex_t used by regexec()/regcomp() for step
128  *	3) the final part is a regex_t used by regexec()/regcomp() for advance
129  *
130  * the special encoding of the first two bytes is referenced throughout
131  * vi. apparently expbuf[0] is set to:
132  *	= 0 upon initialization
133  *	= 1 if the first char of the RE is a ^
134  *	= 0 if the first char of the RE isn't a ^
135  * and expbuf[1-35+]	= bitmap of the type of RE chars in the expression.
136  * this is apparently 0 if there's no RE.
137  * Here, we use expbuf[0] in a similar fashion; and expbuf[1] is non-zero
138  * if there's at least 1 RE in the string.
139  * I say "apparently" as the code to compile()/step() is poorly written.
140  */
141 static char *
142 dhl_compile(instr, expbuf, endbuf)
143 const char *instr;		/* the regular expression		*/
144 char *expbuf;			/* where the compiled RE gets placed	*/
145 char *endbuf;			/* ending addr of expbuf		*/
146 {
147 	int rv;
148 	int alloc = 0;
149 	char adv_instr[4096];	/* PLENTY big temp buffer */
150 	char *instrp;		/* PLENTY big temp buffer */
151 
152 	if (*instr == (char) NULL) {
153 		regerrno = 41;
154 		return (NULL);
155 	}
156 
157 	/*
158 	 * Check values of expbuf and endbuf
159 	 */
160 	if (expbuf == NULL) {
161 		if ((expbuf = malloc(regexc_size)) == NULL) {
162 			regerrno = 50;
163 			return (NULL);
164 		}
165 		memset(&reg_comp, 0, regexc_size);
166 		alloc = 1;
167 		endbuf = expbuf + regexc_size;
168 	} else {		/* Check if enough memory was allocated */
169 		if (expbuf + regexc_size > endbuf) {
170 			regerrno = 50;
171 			return (NULL);
172 		}
173 		memcpy(&reg_comp, expbuf, regexc_size);
174 	}
175 
176 	/*
177 	 * Clear global flags
178 	 */
179 	nbra = 0;
180 	regerrno = 0;
181 
182 	/*
183 	 * Free any data being held for previous search strings
184 	 */
185 	regex_comp_free(&reg_comp);
186 
187 	/*
188 	 * We call regcomp twice, once to get a regex_t for use by step()
189 	 * and then again with for use by advance()
190 	 */
191 	if ((rv = regcomp(&reg_comp.r_stp, instr, regcomp_flags)) != 0) {
192 		regerrno = map_errnos(rv);	/* Convert regcomp error */
193 		goto out;
194 	}
195 	/*
196 	 * To support advance, which assumes an implicit ^ to match at start
197 	 * of line we prepend a ^ to the pattern by copying to a temp buffer
198 	 */
199 
200 	if (instr[0] == '^')
201 		instrp = (char *) instr; /* String already has leading ^ */
202 	else {
203 		adv_instr[0] = '^';
204 		strncpy(&adv_instr[1], instr, 2048);
205 		instrp = adv_instr;
206 	}
207 
208 	if ((rv = regcomp(&reg_comp.r_adv, instrp, regcomp_flags)) != 0) {
209 		regerrno = map_errnos(rv);	/* Convert regcomp error */
210 		goto out;
211 	}
212 
213 	/*
214 	 * update global variables
215 	 */
216 	nbra = (int) reg_comp.r_adv.re_nsub > 0 ?
217 	    (int) reg_comp.r_adv.re_nsub : 0;
218 	regerrno = 0;
219 
220 	/*
221 	 * Set the header flags for use by vi
222 	 */
223 	if (instr[0] == '^') 		/* if beginning of string,	*/
224 		reg_comp.r_head[0] = 1;	/* set special flag		*/
225 	else
226 		reg_comp.r_head[0] = 0;	/* clear special flag		*/
227 	/*
228 	 * note that for a single BRE, nbra will be 0 here.
229 	 * we're guaranteed that, at this point, a RE has been found.
230 	 */
231 	reg_comp.r_head[1] = 1;	/* set special flag		*/
232 	/*
233 	 * Copy our reg_comp structure to expbuf
234 	 */
235 	(void) memcpy(expbuf, (char *) &reg_comp, regexc_size);
236 
237 out:
238 	/*
239 	 * Return code from libgen regcomp with mods.  Note weird return
240 	 * value - if space is malloc'd return pointer to start of space,
241 	 * if user provided their own space, return pointer to 1+last byte
242 	 * of that space.
243 	 */
244 	if (regerrno != 0) {
245 		if (alloc)
246 			free(expbuf);
247 		return (NULL);
248 	}
249 	reglength = regexc_size;
250 
251 	if (alloc)
252 		return (expbuf);
253 	else
254 		return (expbuf + regexc_size);
255 }
256 
257 
258 /*
259  * dhl_step: step through a string until a RE match is found, or end of str
260  */
261 static int
262 dhl_step(str, ep)
263 const char *str;		/* characters to be checked for a match	*/
264 const char *ep;			/* compiled RE from dhl_compile()	*/
265 {
266 	/*
267 	 * Check if we're passed a null ep
268 	 */
269 	if (ep == NULL) {
270 		regerrno = 41;	/* No remembered search string error */
271 		return (0);
272 	}
273 	/*
274 	 * Call common routine with r_stp (step) structure
275 	 */
276 	return (dhl_doit(str, &(((struct regex_comp *) ep)->r_stp),
277 	    ((locs != NULL) ? REG_NOTBOL : 0)));
278 }
279 
280 /*
281  * dhl_advance: implement advance
282  */
283 static int
284 dhl_advance(str, ep)
285 const char *str;		/* characters to be checked for a match	*/
286 const char *ep;			/* compiled RE from dhl_compile()	*/
287 {
288 	int rv;
289 	/*
290 	 * Check if we're passed a null ep
291 	 */
292 	if (ep == NULL) {
293 		regerrno = 41;	/* No remembered search string error */
294 		return (0);
295 	}
296 	/*
297 	 * Call common routine with r_adv (advance) structure
298 	 */
299 	rv = dhl_doit(str, &(((struct regex_comp *) ep)->r_adv), 0);
300 	loc1 = NULL;		/* Clear it per the compile man page */
301 	return (rv);
302 }
303 
304 /*
305  * dhl_doit - common code for step and advance
306  */
307 static int
308 dhl_doit(str, rep, flags)
309 const char *str;		/* characters to be checked for a match	*/
310 const regex_t *rep;
311 const int flags;		/* flags to be passed to regexec directly */
312 {
313 	int rv;
314 	int i;
315 	regmatch_t *prm;	/* ptr to current regmatch_t		*/
316 
317 	/*
318 	 * Check if we're passed a null regex_t
319 	 */
320 	if (rep == NULL) {
321 		regerrno = 41;	/* No remembered search string error */
322 		return (0);
323 	}
324 
325 	regerrno = 0;
326 	prm = &rm[0];
327 
328 	if ((rv = regexec(rep, str, SEPSIZE, prm, flags)) != REG_OK) {
329 		if (rv == REG_NOMATCH)
330 			return (0);
331 		regerrno = map_errnos(rv);
332 		return (0);
333 	}
334 
335 	loc1 = (char *)str + prm->rm_so;
336 	loc2 = (char *)str + prm->rm_eo;
337 
338 	/*
339 	 * Now we need to fill up the bra lists with all of the sub re's
340 	 * Note we subtract nsub -1, and preincrement prm.
341 	 */
342 	for (i = 0; i <= rep->re_nsub; i++) {
343 		prm++;		/* XXX inc past first subexp */
344 		braslist[i] = (char *)str + prm->rm_so;
345 		braelist[i] = (char *)str + prm->rm_eo;
346 		if (i >= SEPSIZE) {
347 			regerrno = 50; 	/* regex overflow */
348 			return (0);
349 		}
350 	}
351 
352 	/*
353 	 * Inverse logic, a zero from regexec - success, is a 1
354 	 * from advance/step.
355 	 */
356 
357 	return (rv == 0);
358 }
359 
360 
/*
 *	regerrno to compile/step error mapping:
 *	This is really a big compromise.  Some errors don't map cleanly:
 *	for example, regcomp error 15 covers what compile() reports as
 *	two separate errors, 44 and 46.  So which one should we map to?
 *	Note REG_ENSUB can't happen - 9 is no longer the maximum number of
 *	subexpressions.
 *	To do your errors right use xregerr() to get the regcomp error
 *	string and print that.
 *
 * |	regcomp/regexec		     | 	Compile/step/advance		    |
 * +---------------------------------+--------------------------------------+
 * 0 REG_OK	  Pattern matched	1  - Pattern matched
 * 1 REG_NOMATCH  No match		0  - Pattern didn't match
 * 2 REG_ECOLLATE Bad collation element	67 - Returned by compile on mbtowc err
 * 3 REG_EESCAPE  trailing \ in pattern	45 - } expected after \.
 * 4 REG_ENEWLINE \n before end pattern	36 - Illegal or missing delimiter.
 * 5 REG_ENSUB	  Over 9 \( \) pairs 	43 - Too many \(
 * 6 REG_ESUBREG  Bad number in \[0-9]  25 - ``\digit'' out of range.
 * 7 REG_EBRACK   [ ] imbalance		49 - [ ] imbalance.
 * 8 REG_EPAREN   ( ) imbalance         42 - \(~\) imbalance.
 * 9 REG_EBRACE   \{ \} imbalance       45 - } expected after \.
 * 10 REG_ERANGE  bad range endpoint	11 - Range endpoint too large.
 * 11 REG_ESPACE  no memory for pattern 50 - Regular expression overflow.
 * 12 REG_BADRPT  invalid repetition	36 - Illegal or missing delimiter.
 * 13 REG_ECTYPE  invalid char-class    67 - illegal byte sequence
 * 14 REG_BADPAT  syntax error		50 - Regular expression overflow.
 * 15 REG_BADBR   \{ \} contents bad	46 - First number exceeds 2nd in \{~\}
 * 16 REG_EFATAL  internal error	50 - Regular expression overflow.
 * 17 REG_ECHAR   bad multibyte char	67 - illegal byte sequence
 * 18 REG_STACK   stack overflow	50 - Regular expression overflow.
 * 19 REG_ENOSYS  function not supported 50 - Regular expression overflow.
 *
 *	For reference here are the compile/step errnos.  We don't generate
 *	41 here - it's done earlier - nor 44, since we can't tell it from 46.
 *
 *	11 - Range endpoint too large.
 *	16 - Bad number.
 *	25 - ``\digit'' out of range.
 *	36 - Illegal or missing delimiter.
 *	41 - No remembered search string.
 *	42 - \(~\) imbalance.
 *	43 - Too many \(.
 *	44 - More than 2 numbers given in "\{~\}"
 *	45 - } expected after \.
 *	46 - First number exceeds 2nd in "\{~\}"
 *	49 - [ ] imbalance.
 *	50 - Regular expression overflow.
 */
409 
410 static int
411 map_errnos(int Errno)
412 {
413 	switch (Errno) {
414 	case REG_ECOLLATE:
415 		regerrno = 67;
416 		break;
417 	case REG_EESCAPE:
418 		regerrno = 45;
419 		break;
420 	case REG_ENEWLINE:
421 		regerrno = 36;
422 		break;
423 	case REG_ENSUB:
424 		regerrno = 43;
425 		break;
426 	case REG_ESUBREG:
427 		regerrno = 25;
428 		break;
429 	case REG_EBRACK:
430 		regerrno = 49;
431 		break;
432 	case REG_EPAREN:
433 		regerrno = 42;
434 		break;
435 	case REG_EBRACE:
436 		regerrno = 45;
437 		break;
438 	case REG_ERANGE:
439 		regerrno = 11;
440 		break;
441 	case REG_ESPACE:
442 		regerrno = 50;
443 		break;
444 	case REG_BADRPT:
445 		regerrno = 36;
446 		break;
447 	case REG_ECTYPE:
448 		regerrno = 67;
449 		break;
450 	case REG_BADPAT:
451 		regerrno = 50;
452 		break;
453 	case REG_BADBR:
454 		regerrno = 46;
455 		break;
456 	case REG_EFATAL:
457 		regerrno = 50;
458 		break;
459 	case REG_ECHAR:
460 		regerrno = 67;
461 		break;
462 	case REG_STACK:
463 		regerrno = 50;
464 		break;
465 	case REG_ENOSYS:
466 		regerrno = 50;
467 		break;
468 	default:
469 		regerrno = 50;
470 		break;
471 	}
472 	return (regerrno);
473 }
474 
475 /*
476  *  This is a routine to clean up the subtle substructure of the struct
477  *  regex_comp type for use by clients of this module.  Since the struct
478  *  type is private, we use a generic interface, and trust the
479  *  application to be damn sure that this operation is valid for the
480  *  named memory.
481  */
482 
483 void
484 regex_comp_free(void * a)
485 {
486 	/*
487 	 * Free any data being held for previous search strings
488 	 */
489 
490 	if (((struct regex_comp *) a) == NULL) {
491 		return;
492 	}
493 
494 	regfree(&((struct regex_comp *)a)->r_stp);
495 	regfree(&((struct regex_comp *)a)->r_adv);
496 }
497