1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 1995-2003 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * xcompile, xstep, xadvance - simulate compile(3g), step(3g), advance(3g)
29 * using regcomp(3c), regexec(3c) interfaces. This is an XCU4
30 * porting aid. switches out to libgen compile/step if collation
31 * table not present.
32 *
33 * Goal is to work with vi and sed/ed.
34 * Returns expbuf in dhl format (encoding of first two bytes).
35 * Note also that this is profoundly single threaded. You
36 * cannot call compile twice with two separate search strings
37 * because the second call will wipe out the earlier stored string.
38 * This must be fixed, plus a general cleanup should be performed
39 * if this is to be integrated into libc.
40 *
41 */
42
43 #pragma ident "%Z%%M% %I% %E% SMI"
44
45 #include <stdio.h>
46 #include <widec.h>
47 #include <sys/types.h>
48 #include <regex.h>
49 #include <locale.h>
50 #include <stdlib.h>
51 #include <locale.h>
52 #include <string.h>
53 #include <unistd.h>
54 #include <regexpr.h>
55
/*
 * pseudo compile/step/advance global variables
 */
/* Objects declared here are defined outside this file. */
extern int	nbra;
extern char	*locs;		/* for stopping excess recursion */
extern char	*loc1;		/* 1st character which matched RE */
extern char	*loc2;		/* char after last char in matched RE */
extern char	*braslist[];	/* start of nbra subexp */
extern char	*braelist[];	/* end of nbra subexp */
extern int	regerrno;
extern int	reglength;

/* interface to specify cflags for regcomp */
int	regcomp_flags;

void regex_comp_free(void *a);
static char *dhl_compile(const char *instr, char *ep, char *endbuf);
static int dhl_step(const char *str, const char *ep);
static int dhl_advance(const char *str, const char *ep);
static int dhl_doit(const char *str, const regex_t *rep, const int flags);
static int map_errnos(int Errno);	/* Convert regcomp error */
76
/*
 * # of sub re's: NOTE: For now the limit on the bra list is defined here,
 * but the fix is to add a maxbra define to regex.h.
 * One problem is that a bigger number is a performance hit since
 * regexec() has a slow initialization loop that goes around SEPSIZE times.
 */
#define	SEPSIZE	20

/* Match slots handed to regexec(); slot 0 receives the whole match. */
static regmatch_t rm[SEPSIZE];

/*
 * Image that dhl_compile() copies into the caller's expbuf: the two
 * header bytes that vi inspects (dl encoding), followed by one compiled
 * pattern for step() and one for advance().
 */
static struct regex_comp {
	char	r_head[2];	/* Header for DL encoding for vi */
	regex_t	r_stp;		/* For use by step */
	regex_t	r_adv;		/* For use by advance */
} reg_comp;

/*
 * Exported size, in bytes, of one struct regex_comp image.
 */
size_t regexc_size = sizeof (struct regex_comp);
100
101
/*
 * compile: public entry point.  Passes the pattern and the buffer bounds
 * to dhl_compile() unchanged and hands back its result.
 */
char *
compile(const char *instr, char *expbuf, char *endbuf)
{
	char *result;

	result = dhl_compile(instr, expbuf, endbuf);
	return (result);
}
107
/*
 * step: public entry point.  Passes the subject string and the compiled
 * expression buffer to dhl_step() unchanged and hands back its result.
 */
int
step(const char *instr, const char *expbuf)
{
	int matched;

	matched = dhl_step(instr, expbuf);
	return (matched);
}
113
/*
 * advance: public entry point.  Passes the subject string and the compiled
 * expression buffer to dhl_advance() unchanged and hands back its result.
 */
int
advance(const char *instr, const char *expbuf)
{
	int matched;

	matched = dhl_advance(instr, expbuf);
	return (matched);
}
119
120
121 /*
122 * the compile and step routines here simulate the old libgen routines of
123 * compile/step Re: regexpr(3G). in order to do this, we must assume
124 * that expbuf[] consists of the following format:
125 * 1) the first two bytes consist of a special encoding - see below.
126 * 2) the next part is a regex_t used by regexec()/regcomp() for step
127 * 3) the final part is a regex_t used by regexec()/regcomp() for advance
128 *
129 * the special encoding of the first two bytes is referenced throughout
130 * vi. apparently expbuf[0] is set to:
131 * = 0 upon initialization
132 * = 1 if the first char of the RE is a ^
133 * = 0 if the first char of the RE isn't a ^
134 * and expbuf[1-35+] = bitmap of the type of RE chars in the expression.
135 * this is apparently 0 if there's no RE.
136 * Here, we use expbuf[0] in a similar fashion; and expbuf[1] is non-zero
137 * if there's at least 1 RE in the string.
138 * I say "apparently" as the code to compile()/step() is poorly written.
139 */
140 static char *
dhl_compile(instr,expbuf,endbuf)141 dhl_compile(instr, expbuf, endbuf)
142 const char *instr; /* the regular expression */
143 char *expbuf; /* where the compiled RE gets placed */
144 char *endbuf; /* ending addr of expbuf */
145 {
146 int rv;
147 int alloc = 0;
148 char adv_instr[4096]; /* PLENTY big temp buffer */
149 char *instrp; /* PLENTY big temp buffer */
150
151 if (*instr == (char) NULL) {
152 regerrno = 41;
153 return (NULL);
154 }
155
156 /*
157 * Check values of expbuf and endbuf
158 */
159 if (expbuf == NULL) {
160 if ((expbuf = malloc(regexc_size)) == NULL) {
161 regerrno = 50;
162 return (NULL);
163 }
164 memset(®_comp, 0, regexc_size);
165 alloc = 1;
166 endbuf = expbuf + regexc_size;
167 } else { /* Check if enough memory was allocated */
168 if (expbuf + regexc_size > endbuf) {
169 regerrno = 50;
170 return (NULL);
171 }
172 memcpy(®_comp, expbuf, regexc_size);
173 }
174
175 /*
176 * Clear global flags
177 */
178 nbra = 0;
179 regerrno = 0;
180
181 /*
182 * Free any data being held for previous search strings
183 */
184 regex_comp_free(®_comp);
185
186 /*
187 * We call regcomp twice, once to get a regex_t for use by step()
188 * and then again with for use by advance()
189 */
190 if ((rv = regcomp(®_comp.r_stp, instr, regcomp_flags)) != 0) {
191 regerrno = map_errnos(rv); /* Convert regcomp error */
192 goto out;
193 }
194 /*
195 * To support advance, which assumes an implicit ^ to match at start
196 * of line we prepend a ^ to the pattern by copying to a temp buffer
197 */
198
199 if (instr[0] == '^')
200 instrp = (char *) instr; /* String already has leading ^ */
201 else {
202 adv_instr[0] = '^';
203 strncpy(&adv_instr[1], instr, 2048);
204 instrp = adv_instr;
205 }
206
207 if ((rv = regcomp(®_comp.r_adv, instrp, regcomp_flags)) != 0) {
208 regerrno = map_errnos(rv); /* Convert regcomp error */
209 goto out;
210 }
211
212 /*
213 * update global variables
214 */
215 nbra = (int) reg_comp.r_adv.re_nsub > 0 ?
216 (int) reg_comp.r_adv.re_nsub : 0;
217 regerrno = 0;
218
219 /*
220 * Set the header flags for use by vi
221 */
222 if (instr[0] == '^') /* if beginning of string, */
223 reg_comp.r_head[0] = 1; /* set special flag */
224 else
225 reg_comp.r_head[0] = 0; /* clear special flag */
226 /*
227 * note that for a single BRE, nbra will be 0 here.
228 * we're guaranteed that, at this point, a RE has been found.
229 */
230 reg_comp.r_head[1] = 1; /* set special flag */
231 /*
232 * Copy our reg_comp structure to expbuf
233 */
234 (void) memcpy(expbuf, (char *) ®_comp, regexc_size);
235
236 out:
237 /*
238 * Return code from libgen regcomp with mods. Note weird return
239 * value - if space is malloc'd return pointer to start of space,
240 * if user provided his own space, return pointer to 1+last byte
241 * of his space.
242 */
243 if (regerrno != 0) {
244 if (alloc)
245 free(expbuf);
246 return (NULL);
247 }
248 reglength = regexc_size;
249
250 if (alloc)
251 return (expbuf);
252 else
253 return (expbuf + regexc_size);
254 }
255
256
257 /*
258 * dhl_step: step through a string until a RE match is found, or end of str
259 */
260 static int
dhl_step(str,ep)261 dhl_step(str, ep)
262 const char *str; /* characters to be checked for a match */
263 const char *ep; /* compiled RE from dhl_compile() */
264 {
265 /*
266 * Check if we're passed a null ep
267 */
268 if (ep == NULL) {
269 regerrno = 41; /* No remembered search string error */
270 return (0);
271 }
272 /*
273 * Call common routine with r_stp (step) structure
274 */
275 return (dhl_doit(str, &(((struct regex_comp *) ep)->r_stp),
276 ((locs != NULL) ? REG_NOTBOL : 0)));
277 }
278
279 /*
280 * dhl_advance: implement advance
281 */
282 static int
dhl_advance(str,ep)283 dhl_advance(str, ep)
284 const char *str; /* characters to be checked for a match */
285 const char *ep; /* compiled RE from dhl_compile() */
286 {
287 int rv;
288 /*
289 * Check if we're passed a null ep
290 */
291 if (ep == NULL) {
292 regerrno = 41; /* No remembered search string error */
293 return (0);
294 }
295 /*
296 * Call common routine with r_adv (advance) structure
297 */
298 rv = dhl_doit(str, &(((struct regex_comp *) ep)->r_adv), 0);
299 loc1 = NULL; /* Clear it per the compile man page */
300 return (rv);
301 }
302
303 /*
304 * dhl_doit - common code for step and advance
305 */
306 static int
dhl_doit(str,rep,flags)307 dhl_doit(str, rep, flags)
308 const char *str; /* characters to be checked for a match */
309 const regex_t *rep;
310 const int flags; /* flags to be passed to regexec directly */
311 {
312 int rv;
313 int i;
314 regmatch_t *prm; /* ptr to current regmatch_t */
315
316 /*
317 * Check if we're passed a null regex_t
318 */
319 if (rep == NULL) {
320 regerrno = 41; /* No remembered search string error */
321 return (0);
322 }
323
324 regerrno = 0;
325 prm = &rm[0];
326
327 if ((rv = regexec(rep, str, SEPSIZE, prm, flags)) != REG_OK) {
328 if (rv == REG_NOMATCH)
329 return (0);
330 regerrno = map_errnos(rv);
331 return (0);
332 }
333
334 loc1 = (char *)str + prm->rm_so;
335 loc2 = (char *)str + prm->rm_eo;
336
337 /*
338 * Now we need to fill up the bra lists with all of the sub re's
339 * Note we subtract nsub -1, and preincrement prm.
340 */
341 for (i = 0; i <= rep->re_nsub; i++) {
342 prm++; /* XXX inc past first subexp */
343 braslist[i] = (char *)str + prm->rm_so;
344 braelist[i] = (char *)str + prm->rm_eo;
345 if (i >= SEPSIZE) {
346 regerrno = 50; /* regex overflow */
347 return (0);
348 }
349 }
350
351 /*
352 * Inverse logic, a zero from regexec - success, is a 1
353 * from advance/step.
354 */
355
356 return (rv == 0);
357 }
358
359
/*
 * regerrno to compile/step error mapping:
 * This is really a big compromise.  Some errors don't map at all,
 * like regcomp error 15, which is generated by both compile() error types
 * 44 & 46.  So which one should we map to?
 * Note REG_ENSUB can't happen - 9 is no longer max num of subexpressions.
 * To do your errors right use xregerr() to get the regcomp error
 * string and print that.
 *
 * |	regcomp/regexec			| Compile/step/advance		     |
 * +---------------------------------+--------------------------------------+
 *  0 REG_OK	   Pattern matched	 1 - Pattern matched
 *  1 REG_NOMATCH  No match		 0 - Pattern didn't match
 *  2 REG_ECOLLATE Bad collation elmnt.	67 - Returned by compile on mbtowc err
 *  3 REG_EESCAPE  trailing \ in pattern	45 - } expected after \.
 *  4 REG_ENEWLINE \n before end pattern	36 - Illegal or missing delimiter.
 *  5 REG_ENSUB	   Over 9 \( \) pairs	43 - Too many \(
 *  6 REG_ESUBREG  Bad number in \[0-9]	25 - ``\digit'' out of range.
 *  7 REG_EBRACK   [ ] imbalance	49 - [ ] imbalance.
 *  8 REG_EPAREN   ( ) imbalance	42 - \(~\) imbalance.
 *  9 REG_EBRACE   \{ \} imbalance	45 - } expected after \.
 * 10 REG_ERANGE   bad range endpoint	11 - Range endpoint too large.
 * 11 REG_ESPACE   no memory for pattern 50 - Regular expression overflow.
 * 12 REG_BADRPT   invalid repetition	36 - Illegal or missing delimiter.
 * 13 REG_ECTYPE   invalid char-class	67 - illegal byte sequence
 * 14 REG_BADPAT   syntax error		50 - Regular expression overflow.
 * 15 REG_BADBR	   \{ \} contents bad	46 - First number exceeds 2nd in \{~\}
 * 16 REG_EFATAL   internal error	50 - Regular expression overflow.
 * 17 REG_ECHAR	   bad multibyte char	67 - illegal byte sequence
 * 18 REG_STACK	   stack overflow	50 - Regular expression overflow.
 * 19 REG_ENOSYS   function not supported 50 - Regular expression overflow.
 *
 * For reference here's the compile/step errno's.  We don't generate
 * 41 here - it's done earlier, nor 44 since we can't tell it from 46.
 *
 * 11 - Range endpoint too large.
 * 16 - Bad number.
 * 25 - ``\digit'' out of range.
 * 36 - Illegal or missing delimiter.
 * 41 - No remembered search string.
 * 42 - \(~\) imbalance.
 * 43 - Too many \(.
 * 44 - More than 2 numbers given in "\{~\}"
 * 45 - } expected after \.
 * 46 - First number exceeds 2nd in "\{~\}"
 * 49 - [ ] imbalance.
 * 50 - Regular expression overflow.
 */
408
409 static int
map_errnos(int Errno)410 map_errnos(int Errno)
411 {
412 switch (Errno) {
413 case REG_ECOLLATE:
414 regerrno = 67;
415 break;
416 case REG_EESCAPE:
417 regerrno = 45;
418 break;
419 case REG_ENEWLINE:
420 regerrno = 36;
421 break;
422 case REG_ENSUB:
423 regerrno = 43;
424 break;
425 case REG_ESUBREG:
426 regerrno = 25;
427 break;
428 case REG_EBRACK:
429 regerrno = 49;
430 break;
431 case REG_EPAREN:
432 regerrno = 42;
433 break;
434 case REG_EBRACE:
435 regerrno = 45;
436 break;
437 case REG_ERANGE:
438 regerrno = 11;
439 break;
440 case REG_ESPACE:
441 regerrno = 50;
442 break;
443 case REG_BADRPT:
444 regerrno = 36;
445 break;
446 case REG_ECTYPE:
447 regerrno = 67;
448 break;
449 case REG_BADPAT:
450 regerrno = 50;
451 break;
452 case REG_BADBR:
453 regerrno = 46;
454 break;
455 case REG_EFATAL:
456 regerrno = 50;
457 break;
458 case REG_ECHAR:
459 regerrno = 67;
460 break;
461 case REG_STACK:
462 regerrno = 50;
463 break;
464 case REG_ENOSYS:
465 regerrno = 50;
466 break;
467 default:
468 regerrno = 50;
469 break;
470 }
471 return (regerrno);
472 }
473
474 /*
475 * This is a routine to clean up the subtle substructure of the struct
476 * regex_comp type for use by clients of this module. Since the struct
477 * type is private, we use a generic interface, and trust the
478 * application to be damn sure that this operation is valid for the
479 * named memory.
480 */
481
482 void
regex_comp_free(void * a)483 regex_comp_free(void * a)
484 {
485 /*
486 * Free any data being held for previous search strings
487 */
488
489 if (((struct regex_comp *) a) == NULL) {
490 return;
491 }
492
493 regfree(&((struct regex_comp *)a)->r_stp);
494 regfree(&((struct regex_comp *)a)->r_adv);
495 }
496