1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 1995-2003 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * xcompile, xstep, xadvance - simulate compile(3g), step(3g), advance(3g) 29 * using regcomp(3c), regexec(3c) interfaces. This is an XCU4 30 * porting aid. switches out to libgen compile/step if collation 31 * table not present. 32 * 33 * Goal is to work with vi and sed/ed. 34 * Returns expbuf in dhl format (encoding of first two bytes). 35 * Note also that this is profoundly single threaded. You 36 * cannot call compile twice with two separate search strings 37 * because the second call will wipe out the earlier stored string. 38 * This must be fixed, plus a general cleanup should be performed 39 * if this is to be integrated into libc. 40 * 41 */ 42 43 #pragma ident "%Z%%M% %I% %E% SMI" 44 45 #include <stdio.h> 46 #include <widec.h> 47 #include <sys/types.h> 48 #include <regex.h> 49 #include <locale.h> 50 #include <stdlib.h> 51 #include <locale.h> 52 #include <string.h> 53 #include <unistd.h> 54 #include <regexpr.h> 55 56 /* 57 * psuedo compile/step/advance global variables 58 */ 59 extern int nbra; 60 extern char *locs; /* for stopping execess recursion */ 61 extern char *loc1; /* 1st character which matched RE */ 62 extern char *loc2; /* char after lst char in matched RE */ 63 extern char *braslist[]; /* start of nbra subexp */ 64 extern char *braelist[]; /* end of nbra subexp */ 65 extern int regerrno; 66 extern int reglength; 67 68 int regcomp_flags; /* interface to specify cflags for regcomp */ 69 70 void regex_comp_free(void *a); 71 static int dhl_step(const char *str, const char *ep); 72 static int dhl_advance(const char *str, const char *ep); 73 static int map_errnos(int); /* Convert regcomp error */ 74 static int dhl_doit(const char *, const regex_t *, const int flags); 75 static char * dhl_compile(const char *instr, char *ep, char *endbuf); 76 77 /* 78 * # of sub re's: NOTE: For now limit on bra list defined here 79 * but fix is to add maxbra define to to regex.h 80 * One problem is that a bigger number is a performance hit since 81 * regexec() has a slow initialization loop that goes around SEPSIZE times 82 */ 83 #define SEPSIZE 20 84 static regmatch_t rm[SEPSIZE]; /* ptr to list of RE matches */ 85 86 /* 87 * Structure to contain dl encoded first two bytes for vi, plus hold two 88 * regex structures, one for advance and one for step. 89 */ 90 static struct regex_comp { 91 char r_head[2]; /* Header for DL encoding for vi */ 92 regex_t r_stp; /* For use by step */ 93 regex_t r_adv; /* For use by advance */ 94 } reg_comp; 95 96 /* 97 * global value for the size of a regex_comp structure: 98 */ 99 size_t regexc_size = sizeof (reg_comp); 100 101 102 char * 103 compile(const char *instr, char *expbuf, char *endbuf) 104 { 105 return (dhl_compile(instr, expbuf, endbuf)); 106 } 107 108 int 109 step(const char *instr, const char *expbuf) 110 { 111 return (dhl_step(instr, expbuf)); 112 } 113 114 int 115 advance(const char *instr, const char *expbuf) 116 { 117 return (dhl_advance(instr, expbuf)); 118 } 119 120 121 /* 122 * the compile and step routines here simulate the old libgen routines of 123 * compile/step Re: regexpr(3G). in order to do this, we must assume 124 * that expbuf[] consists of the following format: 125 * 1) the first two bytes consist of a special encoding - see below. 126 * 2) the next part is a regex_t used by regexec()/regcomp() for step 127 * 3) the final part is a regex_t used by regexec()/regcomp() for advance 128 * 129 * the special encoding of the first two bytes is referenced throughout 130 * vi. apparently expbuf[0] is set to: 131 * = 0 upon initialization 132 * = 1 if the first char of the RE is a ^ 133 * = 0 if the first char of the RE isn't a ^ 134 * and expbuf[1-35+] = bitmap of the type of RE chars in the expression. 135 * this is apparently 0 if there's no RE. 136 * Here, we use expbuf[0] in a similar fashion; and expbuf[1] is non-zero 137 * if there's at least 1 RE in the string. 138 * I say "apparently" as the code to compile()/step() is poorly written. 139 */ 140 static char * 141 dhl_compile(instr, expbuf, endbuf) 142 const char *instr; /* the regular expression */ 143 char *expbuf; /* where the compiled RE gets placed */ 144 char *endbuf; /* ending addr of expbuf */ 145 { 146 int rv; 147 int alloc = 0; 148 char adv_instr[4096]; /* PLENTY big temp buffer */ 149 char *instrp; /* PLENTY big temp buffer */ 150 151 if (*instr == (char) NULL) { 152 regerrno = 41; 153 return (NULL); 154 } 155 156 /* 157 * Check values of expbuf and endbuf 158 */ 159 if (expbuf == NULL) { 160 if ((expbuf = malloc(regexc_size)) == NULL) { 161 regerrno = 50; 162 return (NULL); 163 } 164 memset(®_comp, 0, regexc_size); 165 alloc = 1; 166 endbuf = expbuf + regexc_size; 167 } else { /* Check if enough memory was allocated */ 168 if (expbuf + regexc_size > endbuf) { 169 regerrno = 50; 170 return (NULL); 171 } 172 memcpy(®_comp, expbuf, regexc_size); 173 } 174 175 /* 176 * Clear global flags 177 */ 178 nbra = 0; 179 regerrno = 0; 180 181 /* 182 * Free any data being held for previous search strings 183 */ 184 regex_comp_free(®_comp); 185 186 /* 187 * We call regcomp twice, once to get a regex_t for use by step() 188 * and then again with for use by advance() 189 */ 190 if ((rv = regcomp(®_comp.r_stp, instr, regcomp_flags)) != 0) { 191 regerrno = map_errnos(rv); /* Convert regcomp error */ 192 goto out; 193 } 194 /* 195 * To support advance, which assumes an implicit ^ to match at start 196 * of line we prepend a ^ to the pattern by copying to a temp buffer 197 */ 198 199 if (instr[0] == '^') 200 instrp = (char *) instr; /* String already has leading ^ */ 201 else { 202 adv_instr[0] = '^'; 203 strncpy(&adv_instr[1], instr, 2048); 204 instrp = adv_instr; 205 } 206 207 if ((rv = regcomp(®_comp.r_adv, instrp, regcomp_flags)) != 0) { 208 regerrno = map_errnos(rv); /* Convert regcomp error */ 209 goto out; 210 } 211 212 /* 213 * update global variables 214 */ 215 nbra = (int) reg_comp.r_adv.re_nsub > 0 ? 216 (int) reg_comp.r_adv.re_nsub : 0; 217 regerrno = 0; 218 219 /* 220 * Set the header flags for use by vi 221 */ 222 if (instr[0] == '^') /* if beginning of string, */ 223 reg_comp.r_head[0] = 1; /* set special flag */ 224 else 225 reg_comp.r_head[0] = 0; /* clear special flag */ 226 /* 227 * note that for a single BRE, nbra will be 0 here. 228 * we're guaranteed that, at this point, a RE has been found. 229 */ 230 reg_comp.r_head[1] = 1; /* set special flag */ 231 /* 232 * Copy our reg_comp structure to expbuf 233 */ 234 (void) memcpy(expbuf, (char *) ®_comp, regexc_size); 235 236 out: 237 /* 238 * Return code from libgen regcomp with mods. Note weird return 239 * value - if space is malloc'd return pointer to start of space, 240 * if user provided his own space, return pointer to 1+last byte 241 * of his space. 242 */ 243 if (regerrno != 0) { 244 if (alloc) 245 free(expbuf); 246 return (NULL); 247 } 248 reglength = regexc_size; 249 250 if (alloc) 251 return (expbuf); 252 else 253 return (expbuf + regexc_size); 254 } 255 256 257 /* 258 * dhl_step: step through a string until a RE match is found, or end of str 259 */ 260 static int 261 dhl_step(str, ep) 262 const char *str; /* characters to be checked for a match */ 263 const char *ep; /* compiled RE from dhl_compile() */ 264 { 265 /* 266 * Check if we're passed a null ep 267 */ 268 if (ep == NULL) { 269 regerrno = 41; /* No remembered search string error */ 270 return (0); 271 } 272 /* 273 * Call common routine with r_stp (step) structure 274 */ 275 return (dhl_doit(str, &(((struct regex_comp *) ep)->r_stp), 276 ((locs != NULL) ? REG_NOTBOL : 0))); 277 } 278 279 /* 280 * dhl_advance: implement advance 281 */ 282 static int 283 dhl_advance(str, ep) 284 const char *str; /* characters to be checked for a match */ 285 const char *ep; /* compiled RE from dhl_compile() */ 286 { 287 int rv; 288 /* 289 * Check if we're passed a null ep 290 */ 291 if (ep == NULL) { 292 regerrno = 41; /* No remembered search string error */ 293 return (0); 294 } 295 /* 296 * Call common routine with r_adv (advance) structure 297 */ 298 rv = dhl_doit(str, &(((struct regex_comp *) ep)->r_adv), 0); 299 loc1 = NULL; /* Clear it per the compile man page */ 300 return (rv); 301 } 302 303 /* 304 * dhl_doit - common code for step and advance 305 */ 306 static int 307 dhl_doit(str, rep, flags) 308 const char *str; /* characters to be checked for a match */ 309 const regex_t *rep; 310 const int flags; /* flags to be passed to regexec directly */ 311 { 312 int rv; 313 int i; 314 regmatch_t *prm; /* ptr to current regmatch_t */ 315 316 /* 317 * Check if we're passed a null regex_t 318 */ 319 if (rep == NULL) { 320 regerrno = 41; /* No remembered search string error */ 321 return (0); 322 } 323 324 regerrno = 0; 325 prm = &rm[0]; 326 327 if ((rv = regexec(rep, str, SEPSIZE, prm, flags)) != REG_OK) { 328 if (rv == REG_NOMATCH) 329 return (0); 330 regerrno = map_errnos(rv); 331 return (0); 332 } 333 334 loc1 = (char *)str + prm->rm_so; 335 loc2 = (char *)str + prm->rm_eo; 336 337 /* 338 * Now we need to fill up the bra lists with all of the sub re's 339 * Note we subtract nsub -1, and preincrement prm. 340 */ 341 for (i = 0; i <= rep->re_nsub; i++) { 342 prm++; /* XXX inc past first subexp */ 343 braslist[i] = (char *)str + prm->rm_so; 344 braelist[i] = (char *)str + prm->rm_eo; 345 if (i >= SEPSIZE) { 346 regerrno = 50; /* regex overflow */ 347 return (0); 348 } 349 } 350 351 /* 352 * Inverse logic, a zero from regexec - success, is a 1 353 * from advance/step. 354 */ 355 356 return (rv == 0); 357 } 358 359 360 /* 361 * regerrno to compile/step error mapping: 362 * This is really a big compromise. Some errors don't map at all 363 * like regcomp error 15 is generated by both compile() error types 364 * 44 & 46. So which one should we map to? 365 * Note REG_ESUB Can't happen- 9 is no longer max num of subexpressions 366 * To do your errors right use xregerr() to get the regcomp error 367 * string and print that. 368 * 369 * | regcomp/regexec | Compile/step/advance | 370 * +---------------------------------+--------------------------------------+ 371 * 0 REG_OK Pattern matched 1 - Pattern matched 372 * 1 REG_NOMATCH No match 0 - Pattern didn't match 373 * 2 REG_ECOLLATE Bad collation elmnt. 67 - Returned by compile on mbtowc err 374 * 3 REG_EESCAPE trailing \ in patrn 45 - } expected after \. 375 * 4 REG_ENEWLINE \n before end pattrn 36 - Illegal or missing delimiter. 376 * 5 REG_ENSUB Over 9 \( \) pairs 43 - Too many \( 377 * 6 REG_ESUBREG Bad number in \[0-9] 25 - ``\digit'' out of range. 378 * 7 REG_EBRACK [ ] inbalance 49 - [ ] imbalance. 379 * 8 REG_EPAREN ( ) inbalance 42 - \(~\) imbalance. 380 * 9 REG_EBRACE \{ \} inbalance 45 - } expected after \. 381 * 10 REG_ERANGE bad range endpoint 11 - Range endpoint too large. 382 * 11 REG_ESPACE no memory for pattern 50 - Regular expression overflow. 383 * 12 REG_BADRPT invalid repetition 36 - Illegal or missing delimiter. 384 * 13 REG_ECTYPE invalid char-class 67 - illegal byte sequence 385 * 14 REG_BADPAT syntax error 50 - Regular expression overflow. 386 * 15 REG_BADBR \{ \} contents bad 46 - First number exceeds 2nd in \{~\} 387 * 16 REG_EFATAL internal error 50 - Regular expression overflow. 388 * 17 REG_ECHAR bad mulitbyte char 67 - illegal byte sequence 389 * 18 REG_STACK stack overflow 50 - Regular expression overflow. 390 * 19 REG_ENOSYS function not supported 50- Regular expression overflow. 391 * 392 * For reference here's the compile/step errno's. We don't generate 393 * 41 here - it's done earlier, nor 44 since we can't tell if from 46. 394 * 395 * 11 - Range endpoint too large. 396 * 16 - Bad number. 397 * 25 - ``\digit'' out of range. 398 * 36 - Illegal or missing delimiter. 399 * 41 - No remembered search string. 400 * 42 - \(~\) imbalance. 401 * 43 - Too many \(. 402 * 44 - More than 2 numbers given in "\{~\}" 403 * 45 - } expected after \. 404 * 46 - First number exceeds 2nd in "\{~\}" 405 * 49 - [ ] imbalance. 406 * 50 - Regular expression overflow. 407 */ 408 409 static int 410 map_errnos(int Errno) 411 { 412 switch (Errno) { 413 case REG_ECOLLATE: 414 regerrno = 67; 415 break; 416 case REG_EESCAPE: 417 regerrno = 45; 418 break; 419 case REG_ENEWLINE: 420 regerrno = 36; 421 break; 422 case REG_ENSUB: 423 regerrno = 43; 424 break; 425 case REG_ESUBREG: 426 regerrno = 25; 427 break; 428 case REG_EBRACK: 429 regerrno = 49; 430 break; 431 case REG_EPAREN: 432 regerrno = 42; 433 break; 434 case REG_EBRACE: 435 regerrno = 45; 436 break; 437 case REG_ERANGE: 438 regerrno = 11; 439 break; 440 case REG_ESPACE: 441 regerrno = 50; 442 break; 443 case REG_BADRPT: 444 regerrno = 36; 445 break; 446 case REG_ECTYPE: 447 regerrno = 67; 448 break; 449 case REG_BADPAT: 450 regerrno = 50; 451 break; 452 case REG_BADBR: 453 regerrno = 46; 454 break; 455 case REG_EFATAL: 456 regerrno = 50; 457 break; 458 case REG_ECHAR: 459 regerrno = 67; 460 break; 461 case REG_STACK: 462 regerrno = 50; 463 break; 464 case REG_ENOSYS: 465 regerrno = 50; 466 break; 467 default: 468 regerrno = 50; 469 break; 470 } 471 return (regerrno); 472 } 473 474 /* 475 * This is a routine to clean up the subtle substructure of the struct 476 * regex_comp type for use by clients of this module. Since the struct 477 * type is private, we use a generic interface, and trust the 478 * application to be damn sure that this operation is valid for the 479 * named memory. 480 */ 481 482 void 483 regex_comp_free(void * a) 484 { 485 /* 486 * Free any data being held for previous search strings 487 */ 488 489 if (((struct regex_comp *) a) == NULL) { 490 return; 491 } 492 493 regfree(&((struct regex_comp *)a)->r_stp); 494 regfree(&((struct regex_comp *)a)->r_adv); 495 } 496