1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 1995-2003 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * Copyright (c) 2016 by Delphix. All rights reserved. 26 */ 27 28 /* 29 * xcompile, xstep, xadvance - simulate compile(3g), step(3g), advance(3g) 30 * using regcomp(3c), regexec(3c) interfaces. This is an XCU4 31 * porting aid. switches out to libgen compile/step if collation 32 * table not present. 33 * 34 * Goal is to work with vi and sed/ed. 35 * Returns expbuf in dhl format (encoding of first two bytes). 36 * Note also that this is profoundly single threaded. You 37 * cannot call compile twice with two separate search strings 38 * because the second call will wipe out the earlier stored string. 39 * This must be fixed, plus a general cleanup should be performed 40 * if this is to be integrated into libc. 41 * 42 */ 43 44 #include <stdio.h> 45 #include <widec.h> 46 #include <sys/types.h> 47 #include <regex.h> 48 #include <locale.h> 49 #include <stdlib.h> 50 #include <locale.h> 51 #include <string.h> 52 #include <unistd.h> 53 #include <regexpr.h> 54 55 /* 56 * psuedo compile/step/advance global variables 57 */ 58 extern int nbra; 59 extern char *locs; /* for stopping execess recursion */ 60 extern char *loc1; /* 1st character which matched RE */ 61 extern char *loc2; /* char after lst char in matched RE */ 62 extern char *braslist[]; /* start of nbra subexp */ 63 extern char *braelist[]; /* end of nbra subexp */ 64 extern int regerrno; 65 extern int reglength; 66 67 int regcomp_flags; /* interface to specify cflags for regcomp */ 68 69 void regex_comp_free(void *a); 70 static int dhl_step(const char *str, const char *ep); 71 static int dhl_advance(const char *str, const char *ep); 72 static int map_errnos(int); /* Convert regcomp error */ 73 static int dhl_doit(const char *, const regex_t *, const int flags); 74 static char *dhl_compile(const char *instr, char *ep, char *endbuf); 75 76 /* 77 * # of sub re's: NOTE: For now limit on bra list defined here 78 * but fix is to add maxbra define to to regex.h 79 * One problem is that a bigger number is a performance hit since 80 * regexec() has a slow initialization loop that goes around SEPSIZE times 81 */ 82 #define SEPSIZE 20 83 static regmatch_t rm[SEPSIZE]; /* ptr to list of RE matches */ 84 85 /* 86 * Structure to contain dl encoded first two bytes for vi, plus hold two 87 * regex structures, one for advance and one for step. 88 */ 89 static struct regex_comp { 90 char r_head[2]; /* Header for DL encoding for vi */ 91 regex_t r_stp; /* For use by step */ 92 regex_t r_adv; /* For use by advance */ 93 } reg_comp; 94 95 /* 96 * global value for the size of a regex_comp structure: 97 */ 98 size_t regexc_size = sizeof (reg_comp); 99 100 101 char * 102 compile(const char *instr, char *expbuf, char *endbuf) 103 { 104 return (dhl_compile(instr, expbuf, endbuf)); 105 } 106 107 int 108 step(const char *instr, const char *expbuf) 109 { 110 return (dhl_step(instr, expbuf)); 111 } 112 113 int 114 advance(const char *instr, const char *expbuf) 115 { 116 return (dhl_advance(instr, expbuf)); 117 } 118 119 120 /* 121 * the compile and step routines here simulate the old libgen routines of 122 * compile/step Re: regexpr(3GEN). in order to do this, we must assume 123 * that expbuf[] consists of the following format: 124 * 1) the first two bytes consist of a special encoding - see below. 125 * 2) the next part is a regex_t used by regexec()/regcomp() for step 126 * 3) the final part is a regex_t used by regexec()/regcomp() for advance 127 * 128 * the special encoding of the first two bytes is referenced throughout 129 * vi. apparently expbuf[0] is set to: 130 * = 0 upon initialization 131 * = 1 if the first char of the RE is a ^ 132 * = 0 if the first char of the RE isn't a ^ 133 * and expbuf[1-35+] = bitmap of the type of RE chars in the expression. 134 * this is apparently 0 if there's no RE. 135 * Here, we use expbuf[0] in a similar fashion; and expbuf[1] is non-zero 136 * if there's at least 1 RE in the string. 137 * I say "apparently" as the code to compile()/step() is poorly written. 138 */ 139 static char * 140 dhl_compile(const char *instr, /* the regular expression */ 141 char *expbuf, /* where the compiled RE gets placed */ 142 char *endbuf) /* ending addr of expbuf */ 143 { 144 int rv; 145 int alloc = 0; 146 char adv_instr[4096]; /* PLENTY big temp buffer */ 147 char *instrp; /* PLENTY big temp buffer */ 148 149 if (*instr == '\0') { 150 regerrno = 41; 151 return (NULL); 152 } 153 154 /* 155 * Check values of expbuf and endbuf 156 */ 157 if (expbuf == NULL) { 158 if ((expbuf = malloc(regexc_size)) == NULL) { 159 regerrno = 50; 160 return (NULL); 161 } 162 memset(®_comp, 0, regexc_size); 163 alloc = 1; 164 endbuf = expbuf + regexc_size; 165 } else { /* Check if enough memory was allocated */ 166 if (expbuf + regexc_size > endbuf) { 167 regerrno = 50; 168 return (NULL); 169 } 170 memcpy(®_comp, expbuf, regexc_size); 171 } 172 173 /* 174 * Clear global flags 175 */ 176 nbra = 0; 177 regerrno = 0; 178 179 /* 180 * Free any data being held for previous search strings 181 */ 182 regex_comp_free(®_comp); 183 184 /* 185 * We call regcomp twice, once to get a regex_t for use by step() 186 * and then again with for use by advance() 187 */ 188 if ((rv = regcomp(®_comp.r_stp, instr, regcomp_flags)) != 0) { 189 regerrno = map_errnos(rv); /* Convert regcomp error */ 190 goto out; 191 } 192 /* 193 * To support advance, which assumes an implicit ^ to match at start 194 * of line we prepend a ^ to the pattern by copying to a temp buffer 195 */ 196 197 if (instr[0] == '^') 198 instrp = (char *)instr; /* String already has leading ^ */ 199 else { 200 adv_instr[0] = '^'; 201 strncpy(&adv_instr[1], instr, 2048); 202 instrp = adv_instr; 203 } 204 205 if ((rv = regcomp(®_comp.r_adv, instrp, regcomp_flags)) != 0) { 206 regerrno = map_errnos(rv); /* Convert regcomp error */ 207 goto out; 208 } 209 210 /* 211 * update global variables 212 */ 213 nbra = (int)reg_comp.r_adv.re_nsub > 0 ? 214 (int)reg_comp.r_adv.re_nsub : 0; 215 regerrno = 0; 216 217 /* 218 * Set the header flags for use by vi 219 */ 220 if (instr[0] == '^') /* if beginning of string, */ 221 reg_comp.r_head[0] = 1; /* set special flag */ 222 else 223 reg_comp.r_head[0] = 0; /* clear special flag */ 224 /* 225 * note that for a single BRE, nbra will be 0 here. 226 * we're guaranteed that, at this point, a RE has been found. 227 */ 228 reg_comp.r_head[1] = 1; /* set special flag */ 229 /* 230 * Copy our reg_comp structure to expbuf 231 */ 232 (void) memcpy(expbuf, (char *)®_comp, regexc_size); 233 234 out: 235 /* 236 * Return code from libgen regcomp with mods. Note weird return 237 * value - if space is malloc'd return pointer to start of space, 238 * if user provided their own space, return pointer to 1+last byte 239 * of that space. 240 */ 241 if (regerrno != 0) { 242 if (alloc) 243 free(expbuf); 244 return (NULL); 245 } 246 reglength = regexc_size; 247 248 if (alloc) 249 return (expbuf); 250 else 251 return (expbuf + regexc_size); 252 } 253 254 255 /* 256 * dhl_step: step through a string until a RE match is found, or end of str 257 */ 258 static int 259 dhl_step(const char *str, /* characters to be checked for a match */ 260 const char *ep) /* compiled RE from dhl_compile() */ 261 { 262 /* 263 * Check if we're passed a null ep 264 */ 265 if (ep == NULL) { 266 regerrno = 41; /* No remembered search string error */ 267 return (0); 268 } 269 /* 270 * Call common routine with r_stp (step) structure 271 */ 272 return (dhl_doit(str, &(((struct regex_comp *)ep)->r_stp), 273 ((locs != NULL) ? REG_NOTBOL : 0))); 274 } 275 276 /* 277 * dhl_advance: implement advance 278 */ 279 static int 280 dhl_advance(const char *str, /* characters to be checked for a match */ 281 const char *ep) /* compiled RE from dhl_compile() */ 282 { 283 int rv; 284 /* 285 * Check if we're passed a null ep 286 */ 287 if (ep == NULL) { 288 regerrno = 41; /* No remembered search string error */ 289 return (0); 290 } 291 /* 292 * Call common routine with r_adv (advance) structure 293 */ 294 rv = dhl_doit(str, &(((struct regex_comp *)ep)->r_adv), 0); 295 loc1 = NULL; /* Clear it per the compile man page */ 296 return (rv); 297 } 298 299 /* 300 * dhl_doit - common code for step and advance 301 */ 302 static int 303 dhl_doit(const char *str, /* characters to be checked for a match */ 304 const regex_t *rep, 305 const int flags) /* flags to be passed to regexec directly */ 306 { 307 int rv; 308 int i; 309 regmatch_t *prm; /* ptr to current regmatch_t */ 310 311 /* 312 * Check if we're passed a null regex_t 313 */ 314 if (rep == NULL) { 315 regerrno = 41; /* No remembered search string error */ 316 return (0); 317 } 318 319 regerrno = 0; 320 prm = &rm[0]; 321 322 if ((rv = regexec(rep, str, SEPSIZE, prm, flags)) != REG_OK) { 323 if (rv == REG_NOMATCH) 324 return (0); 325 regerrno = map_errnos(rv); 326 return (0); 327 } 328 329 loc1 = (char *)str + prm->rm_so; 330 loc2 = (char *)str + prm->rm_eo; 331 332 /* 333 * Now we need to fill up the bra lists with all of the sub re's 334 * Note we subtract nsub -1, and preincrement prm. 335 */ 336 for (i = 0; i <= rep->re_nsub; i++) { 337 prm++; /* XXX inc past first subexp */ 338 braslist[i] = (char *)str + prm->rm_so; 339 braelist[i] = (char *)str + prm->rm_eo; 340 if (i >= SEPSIZE) { 341 regerrno = 50; /* regex overflow */ 342 return (0); 343 } 344 } 345 346 /* 347 * Inverse logic, a zero from regexec - success, is a 1 348 * from advance/step. 349 */ 350 351 return (rv == 0); 352 } 353 354 355 /* 356 * regerrno to compile/step error mapping: 357 * This is really a big compromise. Some errors don't map at all 358 * like regcomp error 15 is generated by both compile() error types 359 * 44 & 46. So which one should we map to? 360 * Note REG_ESUB Can't happen- 9 is no longer max num of subexpressions 361 * To do your errors right use xregerr() to get the regcomp error 362 * string and print that. 363 * 364 * | regcomp/regexec | Compile/step/advance | 365 * +---------------------------------+--------------------------------------+ 366 * 0 REG_OK Pattern matched 1 - Pattern matched 367 * 1 REG_NOMATCH No match 0 - Pattern didn't match 368 * 2 REG_ECOLLATE Bad collation elmnt. 67 - Returned by compile on mbtowc err 369 * 3 REG_EESCAPE trailing \ in patrn 45 - } expected after \. 370 * 4 REG_ENEWLINE \n before end pattrn 36 - Illegal or missing delimiter. 371 * 5 REG_ENSUB Over 9 \( \) pairs 43 - Too many \( 372 * 6 REG_ESUBREG Bad number in \[0-9] 25 - ``\digit'' out of range. 373 * 7 REG_EBRACK [ ] inbalance 49 - [ ] imbalance. 374 * 8 REG_EPAREN ( ) inbalance 42 - \(~\) imbalance. 375 * 9 REG_EBRACE \{ \} inbalance 45 - } expected after \. 376 * 10 REG_ERANGE bad range endpoint 11 - Range endpoint too large. 377 * 11 REG_ESPACE no memory for pattern 50 - Regular expression overflow. 378 * 12 REG_BADRPT invalid repetition 36 - Illegal or missing delimiter. 379 * 13 REG_ECTYPE invalid char-class 67 - illegal byte sequence 380 * 14 REG_BADPAT syntax error 50 - Regular expression overflow. 381 * 15 REG_BADBR \{ \} contents bad 46 - First number exceeds 2nd in \{~\} 382 * 16 REG_EFATAL internal error 50 - Regular expression overflow. 383 * 17 REG_ECHAR bad mulitbyte char 67 - illegal byte sequence 384 * 18 REG_STACK stack overflow 50 - Regular expression overflow. 385 * 19 REG_ENOSYS function not supported 50- Regular expression overflow. 386 * 387 * For reference here's the compile/step errno's. We don't generate 388 * 41 here - it's done earlier, nor 44 since we can't tell if from 46. 389 * 390 * 11 - Range endpoint too large. 391 * 16 - Bad number. 392 * 25 - ``\digit'' out of range. 393 * 36 - Illegal or missing delimiter. 394 * 41 - No remembered search string. 395 * 42 - \(~\) imbalance. 396 * 43 - Too many \(. 397 * 44 - More than 2 numbers given in "\{~\}" 398 * 45 - } expected after \. 399 * 46 - First number exceeds 2nd in "\{~\}" 400 * 49 - [ ] imbalance. 401 * 50 - Regular expression overflow. 402 */ 403 404 static int 405 map_errnos(int Errno) 406 { 407 switch (Errno) { 408 case REG_ECOLLATE: 409 regerrno = 67; 410 break; 411 case REG_EESCAPE: 412 regerrno = 45; 413 break; 414 case REG_ENEWLINE: 415 regerrno = 36; 416 break; 417 case REG_ENSUB: 418 regerrno = 43; 419 break; 420 case REG_ESUBREG: 421 regerrno = 25; 422 break; 423 case REG_EBRACK: 424 regerrno = 49; 425 break; 426 case REG_EPAREN: 427 regerrno = 42; 428 break; 429 case REG_EBRACE: 430 regerrno = 45; 431 break; 432 case REG_ERANGE: 433 regerrno = 11; 434 break; 435 case REG_ESPACE: 436 regerrno = 50; 437 break; 438 case REG_BADRPT: 439 regerrno = 36; 440 break; 441 case REG_ECTYPE: 442 regerrno = 67; 443 break; 444 case REG_BADPAT: 445 regerrno = 50; 446 break; 447 case REG_BADBR: 448 regerrno = 46; 449 break; 450 case REG_EFATAL: 451 regerrno = 50; 452 break; 453 case REG_ECHAR: 454 regerrno = 67; 455 break; 456 case REG_STACK: 457 regerrno = 50; 458 break; 459 case REG_ENOSYS: 460 regerrno = 50; 461 break; 462 default: 463 regerrno = 50; 464 break; 465 } 466 return (regerrno); 467 } 468 469 /* 470 * This is a routine to clean up the subtle substructure of the struct 471 * regex_comp type for use by clients of this module. Since the struct 472 * type is private, we use a generic interface, and trust the 473 * application to be damn sure that this operation is valid for the 474 * named memory. 475 */ 476 477 void 478 regex_comp_free(void *a) 479 { 480 /* 481 * Free any data being held for previous search strings 482 */ 483 484 if (a == NULL) { 485 return; 486 } 487 488 regfree(&((struct regex_comp *)a)->r_stp); 489 regfree(&((struct regex_comp *)a)->r_adv); 490 } 491