1 /*- 2 * Copyright (c) 1992, 1993, 1994 3 * The Regents of the University of California. All rights reserved. 4 * Copyright (c) 1992, 1993, 1994, 1995, 1996 5 * Keith Bostic. All rights reserved. 6 * 7 * See the LICENSE file for redistribution information. 8 */ 9 10 #include "config.h" 11 12 #ifndef lint 13 static const char sccsid[] = "$Id: ex_subst.c,v 10.53 2011/12/21 20:40:35 zy Exp $"; 14 #endif /* not lint */ 15 16 #include <sys/types.h> 17 #include <sys/queue.h> 18 #include <sys/time.h> 19 20 #include <bitstring.h> 21 #include <ctype.h> 22 #include <errno.h> 23 #include <limits.h> 24 #include <stdio.h> 25 #include <stdlib.h> 26 #include <string.h> 27 #include <unistd.h> 28 29 #include "../common/common.h" 30 #include "../vi/vi.h" 31 32 #define SUB_FIRST 0x01 /* The 'r' flag isn't reasonable. */ 33 #define SUB_MUSTSETR 0x02 /* The 'r' flag is required. */ 34 35 static int re_conv(SCR *, CHAR_T **, size_t *, int *); 36 static int re_cscope_conv(SCR *, CHAR_T **, size_t *, int *); 37 static int re_sub(SCR *, 38 CHAR_T *, CHAR_T **, size_t *, size_t *, regmatch_t [10]); 39 static int re_tag_conv(SCR *, CHAR_T **, size_t *, int *); 40 static int s(SCR *, EXCMD *, CHAR_T *, regex_t *, u_int); 41 42 /* 43 * ex_s -- 44 * [line [,line]] s[ubstitute] [[/;]pat[/;]/repl[/;] [cgr] [count] [#lp]] 45 * 46 * Substitute on lines matching a pattern. 47 * 48 * PUBLIC: int ex_s(SCR *, EXCMD *); 49 */ 50 int 51 ex_s(SCR *sp, EXCMD *cmdp) 52 { 53 regex_t *re; 54 size_t blen, len; 55 u_int flags; 56 int delim; 57 CHAR_T *bp, *p, *ptrn, *rep, *t; 58 59 /* 60 * Skip leading white space. 61 * 62 * !!! 63 * Historic vi allowed any non-alphanumeric to serve as the 64 * substitution command delimiter. 65 * 66 * !!! 67 * If the arguments are empty, it's the same as &, i.e. we 68 * repeat the last substitution. 69 */ 70 if (cmdp->argc == 0) 71 goto subagain; 72 for (p = cmdp->argv[0]->bp, 73 len = cmdp->argv[0]->len; len > 0; --len, ++p) { 74 if (!cmdskip(*p)) 75 break; 76 } 77 if (len == 0) 78 subagain: return (ex_subagain(sp, cmdp)); 79 80 delim = *p++; 81 if (!isascii(delim) || isalnum(delim) || delim == '\\') 82 return (s(sp, cmdp, p, &sp->subre_c, SUB_MUSTSETR)); 83 84 /* 85 * !!! 86 * The full-blown substitute command reset the remembered 87 * state of the 'c' and 'g' suffices. 88 */ 89 sp->c_suffix = sp->g_suffix = 0; 90 91 /* 92 * Get the pattern string, toss escaping characters. 93 * 94 * !!! 95 * Historic vi accepted any of the following forms: 96 * 97 * :s/abc/def/ change "abc" to "def" 98 * :s/abc/def change "abc" to "def" 99 * :s/abc/ delete "abc" 100 * :s/abc delete "abc" 101 * 102 * QUOTING NOTE: 103 * 104 * Only toss an escaping character if it escapes a delimiter. 105 * This means that "s/A/\\\\f" replaces "A" with "\\f". It 106 * would be nice to be more regular, i.e. for each layer of 107 * escaping a single escaping character is removed, but that's 108 * not how the historic vi worked. 109 */ 110 for (ptrn = t = p;;) { 111 if (p[0] == '\0' || p[0] == delim) { 112 if (p[0] == delim) 113 ++p; 114 /* 115 * !!! 116 * Nul terminate the pattern string -- it's passed 117 * to regcomp which doesn't understand anything else. 118 */ 119 *t = '\0'; 120 break; 121 } 122 if (p[0] == '\\') 123 if (p[1] == delim) 124 ++p; 125 else if (p[1] == '\\') 126 *t++ = *p++; 127 *t++ = *p++; 128 } 129 130 /* 131 * If the pattern string is empty, use the last RE (not just the 132 * last substitution RE). 133 */ 134 if (*ptrn == '\0') { 135 if (sp->re == NULL) { 136 ex_emsg(sp, NULL, EXM_NOPREVRE); 137 return (1); 138 } 139 140 /* Re-compile the RE if necessary. */ 141 if (!F_ISSET(sp, SC_RE_SEARCH) && 142 re_compile(sp, sp->re, sp->re_len, 143 NULL, NULL, &sp->re_c, RE_C_SEARCH)) 144 return (1); 145 flags = 0; 146 } else { 147 /* 148 * !!! 149 * Compile the RE. Historic practice is that substitutes set 150 * the search direction as well as both substitute and search 151 * RE's. We compile the RE twice, as we don't want to bother 152 * ref counting the pattern string and (opaque) structure. 153 */ 154 if (re_compile(sp, ptrn, t - ptrn, &sp->re, 155 &sp->re_len, &sp->re_c, RE_C_SEARCH)) 156 return (1); 157 if (re_compile(sp, ptrn, t - ptrn, &sp->subre, 158 &sp->subre_len, &sp->subre_c, RE_C_SUBST)) 159 return (1); 160 161 flags = SUB_FIRST; 162 sp->searchdir = FORWARD; 163 } 164 re = &sp->re_c; 165 166 /* 167 * Get the replacement string. 168 * 169 * The special character & (\& if O_MAGIC not set) matches the 170 * entire RE. No handling of & is required here, it's done by 171 * re_sub(). 172 * 173 * The special character ~ (\~ if O_MAGIC not set) inserts the 174 * previous replacement string into this replacement string. 175 * Count ~'s to figure out how much space we need. We could 176 * special case nonexistent last patterns or whether or not 177 * O_MAGIC is set, but it's probably not worth the effort. 178 * 179 * QUOTING NOTE: 180 * 181 * Only toss an escaping character if it escapes a delimiter or 182 * if O_MAGIC is set and it escapes a tilde. 183 * 184 * !!! 185 * If the entire replacement pattern is "%", then use the last 186 * replacement pattern. This semantic was added to vi in System 187 * V and then percolated elsewhere, presumably around the time 188 * that it was added to their version of ed(1). 189 */ 190 if (p[0] == '\0' || p[0] == delim) { 191 if (p[0] == delim) 192 ++p; 193 if (sp->repl != NULL) 194 free(sp->repl); 195 sp->repl = NULL; 196 sp->repl_len = 0; 197 } else if (p[0] == '%' && (p[1] == '\0' || p[1] == delim)) 198 p += p[1] == delim ? 2 : 1; 199 else { 200 for (rep = p, len = 0; 201 p[0] != '\0' && p[0] != delim; ++p, ++len) 202 if (p[0] == '~') 203 len += sp->repl_len; 204 GET_SPACE_RETW(sp, bp, blen, len); 205 for (t = bp, len = 0, p = rep;;) { 206 if (p[0] == '\0' || p[0] == delim) { 207 if (p[0] == delim) 208 ++p; 209 break; 210 } 211 if (p[0] == '\\') { 212 if (p[1] == delim) 213 ++p; 214 else if (p[1] == '\\') { 215 *t++ = *p++; 216 ++len; 217 } else if (p[1] == '~') { 218 ++p; 219 if (!O_ISSET(sp, O_MAGIC)) 220 goto tilde; 221 } 222 } else if (p[0] == '~' && O_ISSET(sp, O_MAGIC)) { 223 tilde: ++p; 224 MEMCPY(t, sp->repl, sp->repl_len); 225 t += sp->repl_len; 226 len += sp->repl_len; 227 continue; 228 } 229 *t++ = *p++; 230 ++len; 231 } 232 if ((sp->repl_len = len) != 0) { 233 if (sp->repl != NULL) 234 free(sp->repl); 235 MALLOC(sp, sp->repl, CHAR_T *, len * sizeof(CHAR_T)); 236 if (sp->repl == NULL) { 237 FREE_SPACEW(sp, bp, blen); 238 return (1); 239 } 240 MEMCPY(sp->repl, bp, len); 241 } 242 FREE_SPACEW(sp, bp, blen); 243 } 244 return (s(sp, cmdp, p, re, flags)); 245 } 246 247 /* 248 * ex_subagain -- 249 * [line [,line]] & [cgr] [count] [#lp]] 250 * 251 * Substitute using the last substitute RE and replacement pattern. 252 * 253 * PUBLIC: int ex_subagain(SCR *, EXCMD *); 254 */ 255 int 256 ex_subagain(SCR *sp, EXCMD *cmdp) 257 { 258 if (sp->subre == NULL) { 259 ex_emsg(sp, NULL, EXM_NOPREVRE); 260 return (1); 261 } 262 if (!F_ISSET(sp, SC_RE_SUBST) && 263 re_compile(sp, sp->subre, sp->subre_len, 264 NULL, NULL, &sp->subre_c, RE_C_SUBST)) 265 return (1); 266 return (s(sp, 267 cmdp, cmdp->argc ? cmdp->argv[0]->bp : NULL, &sp->subre_c, 0)); 268 } 269 270 /* 271 * ex_subtilde -- 272 * [line [,line]] ~ [cgr] [count] [#lp]] 273 * 274 * Substitute using the last RE and last substitute replacement pattern. 275 * 276 * PUBLIC: int ex_subtilde(SCR *, EXCMD *); 277 */ 278 int 279 ex_subtilde(SCR *sp, EXCMD *cmdp) 280 { 281 if (sp->re == NULL) { 282 ex_emsg(sp, NULL, EXM_NOPREVRE); 283 return (1); 284 } 285 if (!F_ISSET(sp, SC_RE_SEARCH) && re_compile(sp, sp->re, 286 sp->re_len, NULL, NULL, &sp->re_c, RE_C_SEARCH)) 287 return (1); 288 return (s(sp, 289 cmdp, cmdp->argc ? cmdp->argv[0]->bp : NULL, &sp->re_c, 0)); 290 } 291 292 /* 293 * s -- 294 * Do the substitution. This stuff is *really* tricky. There are lots of 295 * special cases, and general nastiness. Don't mess with it unless you're 296 * pretty confident. 297 * 298 * The nasty part of the substitution is what happens when the replacement 299 * string contains newlines. It's a bit tricky -- consider the information 300 * that has to be retained for "s/f\(o\)o/^M\1^M\1/". The solution here is 301 * to build a set of newline offsets which we use to break the line up later, 302 * when the replacement is done. Don't change it unless you're *damned* 303 * confident. 304 */ 305 #define NEEDNEWLINE(sp) { \ 306 if (sp->newl_len == sp->newl_cnt) { \ 307 sp->newl_len += 25; \ 308 REALLOC(sp, sp->newl, size_t *, \ 309 sp->newl_len * sizeof(size_t)); \ 310 if (sp->newl == NULL) { \ 311 sp->newl_len = 0; \ 312 return (1); \ 313 } \ 314 } \ 315 } 316 317 #define BUILD(sp, l, len) { \ 318 if (lbclen + (len) > lblen) { \ 319 lblen = p2roundup(MAX(lbclen + (len), 256)); \ 320 REALLOC(sp, lb, CHAR_T *, lblen * sizeof(CHAR_T)); \ 321 if (lb == NULL) { \ 322 lbclen = 0; \ 323 return (1); \ 324 } \ 325 } \ 326 MEMCPY(lb + lbclen, l, len); \ 327 lbclen += len; \ 328 } 329 330 #define NEEDSP(sp, len, pnt) { \ 331 if (lbclen + (len) > lblen) { \ 332 lblen = p2roundup(MAX(lbclen + (len), 256)); \ 333 REALLOC(sp, lb, CHAR_T *, lblen * sizeof(CHAR_T)); \ 334 if (lb == NULL) { \ 335 lbclen = 0; \ 336 return (1); \ 337 } \ 338 pnt = lb + lbclen; \ 339 } \ 340 } 341 342 static int 343 s(SCR *sp, EXCMD *cmdp, CHAR_T *s, regex_t *re, u_int flags) 344 { 345 EVENT ev; 346 MARK from, to; 347 TEXTH tiq[] = {{ 0 }}; 348 recno_t elno, lno, slno; 349 u_long ul; 350 regmatch_t match[10]; 351 size_t blen, cnt, last, lbclen, lblen, len, llen; 352 size_t offset, saved_offset, scno; 353 int cflag, lflag, nflag, pflag, rflag; 354 int didsub, do_eol_match, eflags, empty_ok, eval; 355 int linechanged, matched, quit, rval; 356 CHAR_T *bp, *lb; 357 enum nresult nret; 358 359 NEEDFILE(sp, cmdp); 360 361 slno = sp->lno; 362 scno = sp->cno; 363 364 /* 365 * !!! 366 * Historically, the 'g' and 'c' suffices were always toggled as flags, 367 * so ":s/A/B/" was the same as ":s/A/B/ccgg". If O_EDCOMPATIBLE was 368 * not set, they were initialized to 0 for all substitute commands. If 369 * O_EDCOMPATIBLE was set, they were initialized to 0 only if the user 370 * specified substitute/replacement patterns (see ex_s()). 371 */ 372 if (!O_ISSET(sp, O_EDCOMPATIBLE)) 373 sp->c_suffix = sp->g_suffix = 0; 374 375 /* 376 * Historic vi permitted the '#', 'l' and 'p' options in vi mode, but 377 * it only displayed the last change. I'd disallow them, but they are 378 * useful in combination with the [v]global commands. In the current 379 * model the problem is combining them with the 'c' flag -- the screen 380 * would have to flip back and forth between the confirm screen and the 381 * ex print screen, which would be pretty awful. We do display all 382 * changes, though, for what that's worth. 383 * 384 * !!! 385 * Historic vi was fairly strict about the order of "options", the 386 * count, and "flags". I'm somewhat fuzzy on the difference between 387 * options and flags, anyway, so this is a simpler approach, and we 388 * just take it them in whatever order the user gives them. (The ex 389 * usage statement doesn't reflect this.) 390 */ 391 cflag = lflag = nflag = pflag = rflag = 0; 392 if (s == NULL) 393 goto noargs; 394 for (lno = OOBLNO; *s != '\0'; ++s) 395 switch (*s) { 396 case ' ': 397 case '\t': 398 continue; 399 case '+': 400 ++cmdp->flagoff; 401 break; 402 case '-': 403 --cmdp->flagoff; 404 break; 405 case '0': case '1': case '2': case '3': case '4': 406 case '5': case '6': case '7': case '8': case '9': 407 if (lno != OOBLNO) 408 goto usage; 409 errno = 0; 410 nret = nget_uslong(&ul, s, &s, 10); 411 lno = ul; 412 if (*s == '\0') /* Loop increment correction. */ 413 --s; 414 if (nret != NUM_OK) { 415 if (nret == NUM_OVER) 416 msgq(sp, M_ERR, "153|Count overflow"); 417 else if (nret == NUM_UNDER) 418 msgq(sp, M_ERR, "154|Count underflow"); 419 else 420 msgq(sp, M_SYSERR, NULL); 421 return (1); 422 } 423 /* 424 * In historic vi, the count was inclusive from the 425 * second address. 426 */ 427 cmdp->addr1.lno = cmdp->addr2.lno; 428 cmdp->addr2.lno += lno - 1; 429 if (!db_exist(sp, cmdp->addr2.lno) && 430 db_last(sp, &cmdp->addr2.lno)) 431 return (1); 432 break; 433 case '#': 434 nflag = 1; 435 break; 436 case 'c': 437 sp->c_suffix = !sp->c_suffix; 438 439 /* Ex text structure initialization. */ 440 if (F_ISSET(sp, SC_EX)) 441 TAILQ_INIT(tiq); 442 break; 443 case 'g': 444 sp->g_suffix = !sp->g_suffix; 445 break; 446 case 'l': 447 lflag = 1; 448 break; 449 case 'p': 450 pflag = 1; 451 break; 452 case 'r': 453 if (LF_ISSET(SUB_FIRST)) { 454 msgq(sp, M_ERR, 455 "155|Regular expression specified; r flag meaningless"); 456 return (1); 457 } 458 if (!F_ISSET(sp, SC_RE_SEARCH)) { 459 ex_emsg(sp, NULL, EXM_NOPREVRE); 460 return (1); 461 } 462 rflag = 1; 463 re = &sp->re_c; 464 break; 465 default: 466 goto usage; 467 } 468 469 if (*s != '\0' || (!rflag && LF_ISSET(SUB_MUSTSETR))) { 470 usage: ex_emsg(sp, cmdp->cmd->usage, EXM_USAGE); 471 return (1); 472 } 473 474 noargs: if (F_ISSET(sp, SC_VI) && sp->c_suffix && (lflag || nflag || pflag)) { 475 msgq(sp, M_ERR, 476 "156|The #, l and p flags may not be combined with the c flag in vi mode"); 477 return (1); 478 } 479 480 /* 481 * bp: if interactive, line cache 482 * blen: if interactive, line cache length 483 * lb: build buffer pointer. 484 * lbclen: current length of built buffer. 485 * lblen; length of build buffer. 486 */ 487 bp = lb = NULL; 488 blen = lbclen = lblen = 0; 489 490 /* For each line... */ 491 lno = cmdp->addr1.lno == 0 ? 1 : cmdp->addr1.lno; 492 for (matched = quit = 0, 493 elno = cmdp->addr2.lno; !quit && lno <= elno; ++lno) { 494 495 /* Someone's unhappy, time to stop. */ 496 if (INTERRUPTED(sp)) 497 break; 498 499 /* Get the line. */ 500 if (db_get(sp, lno, DBG_FATAL, &s, &llen)) 501 goto err; 502 503 /* 504 * Make a local copy if doing confirmation -- when calling 505 * the confirm routine we're likely to lose the cached copy. 506 */ 507 if (sp->c_suffix) { 508 if (bp == NULL) { 509 GET_SPACE_RETW(sp, bp, blen, llen); 510 } else 511 ADD_SPACE_RETW(sp, bp, blen, llen); 512 MEMCPY(bp, s, llen); 513 s = bp; 514 } 515 516 /* Start searching from the beginning. */ 517 offset = 0; 518 len = llen; 519 520 /* Reset the build buffer offset. */ 521 lbclen = 0; 522 523 /* Reset empty match flag. */ 524 empty_ok = 1; 525 526 /* 527 * We don't want to have to do a setline if the line didn't 528 * change -- keep track of whether or not this line changed. 529 * If doing confirmations, don't want to keep setting the 530 * line if change is refused -- keep track of substitutions. 531 */ 532 didsub = linechanged = 0; 533 534 /* New line, do an EOL match. */ 535 do_eol_match = 1; 536 537 /* It's not nul terminated, but we pretend it is. */ 538 eflags = REG_STARTEND; 539 540 /* 541 * The search area is from s + offset to the EOL. 542 * 543 * Generally, match[0].rm_so is the offset of the start 544 * of the match from the start of the search, and offset 545 * is the offset of the start of the last search. 546 */ 547 nextmatch: match[0].rm_so = 0; 548 match[0].rm_eo = len; 549 550 /* Get the next match. */ 551 eval = regexec(re, s + offset, 10, match, eflags); 552 553 /* 554 * There wasn't a match or if there was an error, deal with 555 * it. If there was a previous match in this line, resolve 556 * the changes into the database. Otherwise, just move on. 557 */ 558 if (eval == REG_NOMATCH) 559 goto endmatch; 560 if (eval != 0) { 561 re_error(sp, eval, re); 562 goto err; 563 } 564 matched = 1; 565 566 /* Only the first search can match an anchored expression. */ 567 eflags |= REG_NOTBOL; 568 569 /* 570 * !!! 571 * It's possible to match 0-length strings -- for example, the 572 * command s;a*;X;, when matched against the string "aabb" will 573 * result in "XbXbX", i.e. the matches are "aa", the space 574 * between the b's and the space between the b's and the end of 575 * the string. There is a similar space between the beginning 576 * of the string and the a's. The rule that we use (because vi 577 * historically used it) is that any 0-length match, occurring 578 * immediately after a match, is ignored. Otherwise, the above 579 * example would have resulted in "XXbXbX". Another example is 580 * incorrectly using " *" to replace groups of spaces with one 581 * space. 582 * 583 * The way we do this is that if we just had a successful match, 584 * the starting offset does not skip characters, and the match 585 * is empty, ignore the match and move forward. If there's no 586 * more characters in the string, we were attempting to match 587 * after the last character, so quit. 588 */ 589 if (!empty_ok && match[0].rm_so == 0 && match[0].rm_eo == 0) { 590 empty_ok = 1; 591 if (len == 0) 592 goto endmatch; 593 BUILD(sp, s + offset, 1) 594 ++offset; 595 --len; 596 goto nextmatch; 597 } 598 599 /* Confirm change. */ 600 if (sp->c_suffix) { 601 /* 602 * Set the cursor position for confirmation. Note, 603 * if we matched on a '$', the cursor may be past 604 * the end of line. 605 */ 606 from.lno = to.lno = lno; 607 from.cno = match[0].rm_so + offset; 608 to.cno = match[0].rm_eo + offset; 609 /* 610 * Both ex and vi have to correct for a change before 611 * the first character in the line. 612 */ 613 if (llen == 0) 614 from.cno = to.cno = 0; 615 if (F_ISSET(sp, SC_VI)) { 616 /* 617 * Only vi has to correct for a change after 618 * the last character in the line. 619 * 620 * XXX 621 * It would be nice to change the vi code so 622 * that we could display a cursor past EOL. 623 */ 624 if (to.cno >= llen) 625 to.cno = llen - 1; 626 if (from.cno >= llen) 627 from.cno = llen - 1; 628 629 sp->lno = from.lno; 630 sp->cno = from.cno; 631 if (vs_refresh(sp, 1)) 632 goto err; 633 634 vs_update(sp, msg_cat(sp, 635 "169|Confirm change? [n]", NULL), NULL); 636 637 if (v_event_get(sp, &ev, 0, 0)) 638 goto err; 639 switch (ev.e_event) { 640 case E_CHARACTER: 641 break; 642 case E_EOF: 643 case E_ERR: 644 case E_INTERRUPT: 645 goto lquit; 646 default: 647 v_event_err(sp, &ev); 648 goto lquit; 649 } 650 } else { 651 if (ex_print(sp, cmdp, &from, &to, 0) || 652 ex_scprint(sp, &from, &to)) 653 goto lquit; 654 if (ex_txt(sp, tiq, 0, TXT_CR)) 655 goto err; 656 ev.e_c = TAILQ_FIRST(tiq)->lb[0]; 657 } 658 659 switch (ev.e_c) { 660 case CH_YES: 661 break; 662 default: 663 case CH_NO: 664 didsub = 0; 665 BUILD(sp, s +offset, match[0].rm_eo); 666 goto skip; 667 case CH_QUIT: 668 /* Set the quit/interrupted flags. */ 669 lquit: quit = 1; 670 F_SET(sp->gp, G_INTERRUPTED); 671 672 /* 673 * Resolve any changes, then return to (and 674 * exit from) the main loop. 675 */ 676 goto endmatch; 677 } 678 } 679 680 /* 681 * Set the cursor to the last position changed, converting 682 * from 1-based to 0-based. 683 */ 684 sp->lno = lno; 685 sp->cno = match[0].rm_so; 686 687 /* Copy the bytes before the match into the build buffer. */ 688 BUILD(sp, s + offset, match[0].rm_so); 689 690 /* Substitute the matching bytes. */ 691 didsub = 1; 692 if (re_sub(sp, s + offset, &lb, &lbclen, &lblen, match)) 693 goto err; 694 695 /* Set the change flag so we know this line was modified. */ 696 linechanged = 1; 697 698 /* Move past the matched bytes. */ 699 skip: offset += match[0].rm_eo; 700 len -= match[0].rm_eo; 701 702 /* A match cannot be followed by an empty pattern. */ 703 empty_ok = 0; 704 705 /* 706 * If doing a global change with confirmation, we have to 707 * update the screen. The basic idea is to store the line 708 * so the screen update routines can find it, and restart. 709 */ 710 if (didsub && sp->c_suffix && sp->g_suffix) { 711 /* 712 * The new search offset will be the end of the 713 * modified line. 714 */ 715 saved_offset = lbclen; 716 717 /* Copy the rest of the line. */ 718 if (len) 719 BUILD(sp, s + offset, len) 720 721 /* Set the new offset. */ 722 offset = saved_offset; 723 724 /* Store inserted lines, adjusting the build buffer. */ 725 last = 0; 726 if (sp->newl_cnt) { 727 for (cnt = 0; 728 cnt < sp->newl_cnt; ++cnt, ++lno, ++elno) { 729 if (db_insert(sp, lno, 730 lb + last, sp->newl[cnt] - last)) 731 goto err; 732 last = sp->newl[cnt] + 1; 733 ++sp->rptlines[L_ADDED]; 734 } 735 lbclen -= last; 736 offset -= last; 737 sp->newl_cnt = 0; 738 } 739 740 /* Store and retrieve the line. */ 741 if (db_set(sp, lno, lb + last, lbclen)) 742 goto err; 743 if (db_get(sp, lno, DBG_FATAL, &s, &llen)) 744 goto err; 745 ADD_SPACE_RETW(sp, bp, blen, llen) 746 MEMCPY(bp, s, llen); 747 s = bp; 748 len = llen - offset; 749 750 /* Restart the build. */ 751 lbclen = 0; 752 BUILD(sp, s, offset); 753 754 /* 755 * If we haven't already done the after-the-string 756 * match, do one. Set REG_NOTEOL so the '$' pattern 757 * only matches once. 758 */ 759 if (!do_eol_match) 760 goto endmatch; 761 if (offset == len) { 762 do_eol_match = 0; 763 eflags |= REG_NOTEOL; 764 } 765 goto nextmatch; 766 } 767 768 /* 769 * If it's a global: 770 * 771 * If at the end of the string, do a test for the after 772 * the string match. Set REG_NOTEOL so the '$' pattern 773 * only matches once. 774 */ 775 if (sp->g_suffix && do_eol_match) { 776 if (len == 0) { 777 do_eol_match = 0; 778 eflags |= REG_NOTEOL; 779 } 780 goto nextmatch; 781 } 782 783 endmatch: if (!linechanged) 784 continue; 785 786 /* Copy any remaining bytes into the build buffer. */ 787 if (len) 788 BUILD(sp, s + offset, len) 789 790 /* Store inserted lines, adjusting the build buffer. */ 791 last = 0; 792 if (sp->newl_cnt) { 793 for (cnt = 0; 794 cnt < sp->newl_cnt; ++cnt, ++lno, ++elno) { 795 if (db_insert(sp, 796 lno, lb + last, sp->newl[cnt] - last)) 797 goto err; 798 last = sp->newl[cnt] + 1; 799 ++sp->rptlines[L_ADDED]; 800 } 801 lbclen -= last; 802 sp->newl_cnt = 0; 803 } 804 805 /* Store the changed line. */ 806 if (db_set(sp, lno, lb + last, lbclen)) 807 goto err; 808 809 /* Update changed line counter. */ 810 if (sp->rptlchange != lno) { 811 sp->rptlchange = lno; 812 ++sp->rptlines[L_CHANGED]; 813 } 814 815 /* 816 * !!! 817 * Display as necessary. Historic practice is to only 818 * display the last line of a line split into multiple 819 * lines. 820 */ 821 if (lflag || nflag || pflag) { 822 from.lno = to.lno = lno; 823 from.cno = to.cno = 0; 824 if (lflag) 825 (void)ex_print(sp, cmdp, &from, &to, E_C_LIST); 826 if (nflag) 827 (void)ex_print(sp, cmdp, &from, &to, E_C_HASH); 828 if (pflag) 829 (void)ex_print(sp, cmdp, &from, &to, E_C_PRINT); 830 } 831 } 832 833 /* 834 * !!! 835 * Historically, vi attempted to leave the cursor at the same place if 836 * the substitution was done at the current cursor position. Otherwise 837 * it moved it to the first non-blank of the last line changed. There 838 * were some problems: for example, :s/$/foo/ with the cursor on the 839 * last character of the line left the cursor on the last character, or 840 * the & command with multiple occurrences of the matching string in the 841 * line usually left the cursor in a fairly random position. 842 * 843 * We try to do the same thing, with the exception that if the user is 844 * doing substitution with confirmation, we move to the last line about 845 * which the user was consulted, as opposed to the last line that they 846 * actually changed. This prevents a screen flash if the user doesn't 847 * change many of the possible lines. 848 */ 849 if (!sp->c_suffix && (sp->lno != slno || sp->cno != scno)) { 850 sp->cno = 0; 851 (void)nonblank(sp, sp->lno, &sp->cno); 852 } 853 854 /* 855 * If not in a global command, and nothing matched, say so. 856 * Else, if none of the lines displayed, put something up. 857 */ 858 rval = 0; 859 if (!matched) { 860 if (!F_ISSET(sp, SC_EX_GLOBAL)) { 861 msgq(sp, M_ERR, "157|No match found"); 862 goto err; 863 } 864 } else if (!lflag && !nflag && !pflag) 865 F_SET(cmdp, E_AUTOPRINT); 866 867 if (0) { 868 err: rval = 1; 869 } 870 871 if (bp != NULL) 872 FREE_SPACEW(sp, bp, blen); 873 if (lb != NULL) 874 free(lb); 875 return (rval); 876 } 877 878 /* 879 * re_compile -- 880 * Compile the RE. 881 * 882 * PUBLIC: int re_compile(SCR *, 883 * PUBLIC: CHAR_T *, size_t, CHAR_T **, size_t *, regex_t *, u_int); 884 */ 885 int 886 re_compile(SCR *sp, CHAR_T *ptrn, size_t plen, CHAR_T **ptrnp, size_t *lenp, regex_t *rep, u_int flags) 887 { 888 size_t len; 889 int reflags, replaced, rval; 890 CHAR_T *p; 891 892 /* Set RE flags. */ 893 reflags = 0; 894 if (!LF_ISSET(RE_C_CSCOPE | RE_C_TAG)) { 895 if (O_ISSET(sp, O_EXTENDED)) 896 reflags |= REG_EXTENDED; 897 if (O_ISSET(sp, O_IGNORECASE)) 898 reflags |= REG_ICASE; 899 if (O_ISSET(sp, O_ICLOWER)) { 900 for (p = ptrn, len = plen; len > 0; ++p, --len) 901 if (ISUPPER(*p)) 902 break; 903 if (len == 0) 904 reflags |= REG_ICASE; 905 } 906 } 907 908 /* If we're replacing a saved value, clear the old one. */ 909 if (LF_ISSET(RE_C_SEARCH) && F_ISSET(sp, SC_RE_SEARCH)) { 910 regfree(&sp->re_c); 911 F_CLR(sp, SC_RE_SEARCH); 912 } 913 if (LF_ISSET(RE_C_SUBST) && F_ISSET(sp, SC_RE_SUBST)) { 914 regfree(&sp->subre_c); 915 F_CLR(sp, SC_RE_SUBST); 916 } 917 918 /* 919 * If we're saving the string, it's a pattern we haven't seen before, 920 * so convert the vi-style RE's to POSIX 1003.2 RE's. Save a copy for 921 * later recompilation. Free any previously saved value. 922 */ 923 if (ptrnp != NULL) { 924 replaced = 0; 925 if (LF_ISSET(RE_C_CSCOPE)) { 926 if (re_cscope_conv(sp, &ptrn, &plen, &replaced)) 927 return (1); 928 /* 929 * XXX 930 * Currently, the match-any-<blank> expression used in 931 * re_cscope_conv() requires extended RE's. This may 932 * not be right or safe. 933 */ 934 reflags |= REG_EXTENDED; 935 } else if (LF_ISSET(RE_C_TAG)) { 936 if (re_tag_conv(sp, &ptrn, &plen, &replaced)) 937 return (1); 938 } else 939 if (re_conv(sp, &ptrn, &plen, &replaced)) 940 return (1); 941 942 /* Discard previous pattern. */ 943 if (*ptrnp != NULL) { 944 free(*ptrnp); 945 *ptrnp = NULL; 946 } 947 if (lenp != NULL) 948 *lenp = plen; 949 950 /* 951 * Copy the string into allocated memory. 952 * 953 * XXX 954 * Regcomp isn't 8-bit clean, so the pattern is nul-terminated 955 * for now. There's just no other solution. 956 */ 957 MALLOC(sp, *ptrnp, CHAR_T *, (plen + 1) * sizeof(CHAR_T)); 958 if (*ptrnp != NULL) { 959 MEMCPY(*ptrnp, ptrn, plen); 960 (*ptrnp)[plen] = '\0'; 961 } 962 963 /* Free up conversion-routine-allocated memory. */ 964 if (replaced) 965 FREE_SPACEW(sp, ptrn, 0); 966 967 if (*ptrnp == NULL) 968 return (1); 969 970 ptrn = *ptrnp; 971 } 972 973 /* 974 * XXX 975 * Regcomp isn't 8-bit clean, so we just lost if the pattern 976 * contained a nul. Bummer! 977 */ 978 if ((rval = regcomp(rep, ptrn, /* plen, */ reflags)) != 0) { 979 if (!LF_ISSET(RE_C_SILENT)) 980 re_error(sp, rval, rep); 981 return (1); 982 } 983 984 if (LF_ISSET(RE_C_SEARCH)) 985 F_SET(sp, SC_RE_SEARCH); 986 if (LF_ISSET(RE_C_SUBST)) 987 F_SET(sp, SC_RE_SUBST); 988 989 return (0); 990 } 991 992 /* 993 * re_conv -- 994 * Convert vi's regular expressions into something that the 995 * the POSIX 1003.2 RE functions can handle. 996 * 997 * There are three conversions we make to make vi's RE's (specifically 998 * the global, search, and substitute patterns) work with POSIX RE's. 999 * 1000 * 1: If O_MAGIC is not set, strip backslashes from the magic character 1001 * set (.[*~) that have them, and add them to the ones that don't. 1002 * 2: If O_MAGIC is not set, the string "\~" is replaced with the text 1003 * from the last substitute command's replacement string. If O_MAGIC 1004 * is set, it's the string "~". 1005 * 3: The pattern \<ptrn\> does "word" searches, convert it to use the 1006 * new RE escapes. 1007 * 1008 * !!!/XXX 1009 * This doesn't exactly match the historic behavior of vi because we do 1010 * the ~ substitution before calling the RE engine, so magic characters 1011 * in the replacement string will be expanded by the RE engine, and they 1012 * weren't historically. It's a bug. 1013 */ 1014 static int 1015 re_conv(SCR *sp, CHAR_T **ptrnp, size_t *plenp, int *replacedp) 1016 { 1017 size_t blen, len, needlen; 1018 int magic; 1019 CHAR_T *bp, *p, *t; 1020 1021 /* 1022 * First pass through, we figure out how much space we'll need. 1023 * We do it in two passes, on the grounds that most of the time 1024 * the user is doing a search and won't have magic characters. 1025 * That way we can skip most of the memory allocation and copies. 1026 */ 1027 magic = 0; 1028 for (p = *ptrnp, len = *plenp, needlen = 0; len > 0; ++p, --len) 1029 switch (*p) { 1030 case '\\': 1031 if (len > 1) { 1032 --len; 1033 switch (*++p) { 1034 case '<': 1035 magic = 1; 1036 needlen += RE_WSTART_LEN + 1; 1037 break; 1038 case '>': 1039 magic = 1; 1040 needlen += RE_WSTOP_LEN + 1; 1041 break; 1042 case '~': 1043 if (!O_ISSET(sp, O_MAGIC)) { 1044 magic = 1; 1045 needlen += sp->repl_len; 1046 } 1047 break; 1048 case '.': 1049 case '[': 1050 case '*': 1051 if (!O_ISSET(sp, O_MAGIC)) { 1052 magic = 1; 1053 needlen += 1; 1054 } 1055 break; 1056 default: 1057 needlen += 2; 1058 } 1059 } else 1060 needlen += 1; 1061 break; 1062 case '~': 1063 if (O_ISSET(sp, O_MAGIC)) { 1064 magic = 1; 1065 needlen += sp->repl_len; 1066 } 1067 break; 1068 case '.': 1069 case '[': 1070 case '*': 1071 if (!O_ISSET(sp, O_MAGIC)) { 1072 magic = 1; 1073 needlen += 2; 1074 } 1075 break; 1076 default: 1077 needlen += 1; 1078 break; 1079 } 1080 1081 if (!magic) { 1082 *replacedp = 0; 1083 return (0); 1084 } 1085 1086 /* Get enough memory to hold the final pattern. */ 1087 *replacedp = 1; 1088 GET_SPACE_RETW(sp, bp, blen, needlen); 1089 1090 for (p = *ptrnp, len = *plenp, t = bp; len > 0; ++p, --len) 1091 switch (*p) { 1092 case '\\': 1093 if (len > 1) { 1094 --len; 1095 switch (*++p) { 1096 case '<': 1097 MEMCPY(t, 1098 RE_WSTART, RE_WSTART_LEN); 1099 t += RE_WSTART_LEN; 1100 break; 1101 case '>': 1102 MEMCPY(t, 1103 RE_WSTOP, RE_WSTOP_LEN); 1104 t += RE_WSTOP_LEN; 1105 break; 1106 case '~': 1107 if (O_ISSET(sp, O_MAGIC)) 1108 *t++ = '~'; 1109 else { 1110 MEMCPY(t, 1111 sp->repl, sp->repl_len); 1112 t += sp->repl_len; 1113 } 1114 break; 1115 case '.': 1116 case '[': 1117 case '*': 1118 if (O_ISSET(sp, O_MAGIC)) 1119 *t++ = '\\'; 1120 *t++ = *p; 1121 break; 1122 default: 1123 *t++ = '\\'; 1124 *t++ = *p; 1125 } 1126 } else 1127 *t++ = '\\'; 1128 break; 1129 case '~': 1130 if (O_ISSET(sp, O_MAGIC)) { 1131 MEMCPY(t, sp->repl, sp->repl_len); 1132 t += sp->repl_len; 1133 } else 1134 *t++ = '~'; 1135 break; 1136 case '.': 1137 case '[': 1138 case '*': 1139 if (!O_ISSET(sp, O_MAGIC)) 1140 *t++ = '\\'; 1141 *t++ = *p; 1142 break; 1143 default: 1144 *t++ = *p; 1145 break; 1146 } 1147 1148 *ptrnp = bp; 1149 *plenp = t - bp; 1150 return (0); 1151 } 1152 1153 /* 1154 * re_tag_conv -- 1155 * Convert a tags search path into something that the POSIX 1156 * 1003.2 RE functions can handle. 1157 */ 1158 static int 1159 re_tag_conv(SCR *sp, CHAR_T **ptrnp, size_t *plenp, int *replacedp) 1160 { 1161 size_t blen, len; 1162 int lastdollar; 1163 CHAR_T *bp, *p, *t; 1164 1165 len = *plenp; 1166 1167 /* Max memory usage is 2 times the length of the string. */ 1168 *replacedp = 1; 1169 GET_SPACE_RETW(sp, bp, blen, len * 2); 1170 1171 p = *ptrnp; 1172 t = bp; 1173 1174 /* If the last character is a '/' or '?', we just strip it. */ 1175 if (len > 0 && (p[len - 1] == '/' || p[len - 1] == '?')) 1176 --len; 1177 1178 /* If the next-to-last or last character is a '$', it's magic. */ 1179 if (len > 0 && p[len - 1] == '$') { 1180 --len; 1181 lastdollar = 1; 1182 } else 1183 lastdollar = 0; 1184 1185 /* If the first character is a '/' or '?', we just strip it. */ 1186 if (len > 0 && (p[0] == '/' || p[0] == '?')) { 1187 ++p; 1188 --len; 1189 } 1190 1191 /* If the first or second character is a '^', it's magic. */ 1192 if (p[0] == '^') { 1193 *t++ = *p++; 1194 --len; 1195 } 1196 1197 /* 1198 * Escape every other magic character we can find, meanwhile stripping 1199 * the backslashes ctags inserts when escaping the search delimiter 1200 * characters. 1201 */ 1202 for (; len > 0; --len) { 1203 if (p[0] == '\\' && (p[1] == '/' || p[1] == '?')) { 1204 ++p; 1205 --len; 1206 } else if (STRCHR(L("^.[]$*"), p[0])) 1207 *t++ = '\\'; 1208 *t++ = *p++; 1209 } 1210 if (lastdollar) 1211 *t++ = '$'; 1212 1213 *ptrnp = bp; 1214 *plenp = t - bp; 1215 return (0); 1216 } 1217 1218 /* 1219 * re_cscope_conv -- 1220 * Convert a cscope search path into something that the POSIX 1221 * 1003.2 RE functions can handle. 1222 */ 1223 static int 1224 re_cscope_conv(SCR *sp, CHAR_T **ptrnp, size_t *plenp, int *replacedp) 1225 { 1226 size_t blen, len, nspaces; 1227 CHAR_T *bp, *t; 1228 CHAR_T *p; 1229 CHAR_T *wp; 1230 size_t wlen; 1231 1232 /* 1233 * Each space in the source line printed by cscope represents an 1234 * arbitrary sequence of spaces, tabs, and comments. 1235 */ 1236 #define CSCOPE_RE_SPACE "([ \t]|/\\*([^*]|\\*/)*\\*/)*" 1237 #define CSCOPE_LEN sizeof(CSCOPE_RE_SPACE) - 1 1238 CHAR2INT(sp, CSCOPE_RE_SPACE, CSCOPE_LEN, wp, wlen); 1239 for (nspaces = 0, p = *ptrnp, len = *plenp; len > 0; ++p, --len) 1240 if (*p == ' ') 1241 ++nspaces; 1242 1243 /* 1244 * Allocate plenty of space: 1245 * the string, plus potential escaping characters; 1246 * nspaces + 2 copies of CSCOPE_RE_SPACE; 1247 * ^, $, nul terminator characters. 1248 */ 1249 *replacedp = 1; 1250 len = (p - *ptrnp) * 2 + (nspaces + 2) * sizeof(CSCOPE_RE_SPACE) + 3; 1251 GET_SPACE_RETW(sp, bp, blen, len); 1252 1253 p = *ptrnp; 1254 t = bp; 1255 1256 *t++ = '^'; 1257 MEMCPY(t, wp, wlen); 1258 t += wlen; 1259 1260 for (len = *plenp; len > 0; ++p, --len) 1261 if (*p == ' ') { 1262 MEMCPY(t, wp, wlen); 1263 t += wlen; 1264 } else { 1265 if (STRCHR(L("\\^.[]$*+?()|{}"), *p)) 1266 *t++ = '\\'; 1267 *t++ = *p; 1268 } 1269 1270 MEMCPY(t, wp, wlen); 1271 t += wlen; 1272 *t++ = '$'; 1273 1274 *ptrnp = bp; 1275 *plenp = t - bp; 1276 return (0); 1277 } 1278 1279 /* 1280 * re_error -- 1281 * Report a regular expression error. 1282 * 1283 * PUBLIC: void re_error(SCR *, int, regex_t *); 1284 */ 1285 void 1286 re_error(SCR *sp, int errcode, regex_t *preg) 1287 { 1288 size_t s; 1289 char *oe; 1290 1291 s = regerror(errcode, preg, "", 0); 1292 MALLOC(sp, oe, char *, s); 1293 if (oe != NULL) { 1294 (void)regerror(errcode, preg, oe, s); 1295 msgq(sp, M_ERR, "RE error: %s", oe); 1296 free(oe); 1297 } 1298 } 1299 1300 /* 1301 * re_sub -- 1302 * Do the substitution for a regular expression. 1303 */ 1304 static int 1305 re_sub( 1306 SCR *sp, 1307 CHAR_T *ip, /* Input line. */ 1308 CHAR_T **lbp, 1309 size_t *lbclenp, 1310 size_t *lblenp, 1311 regmatch_t match[10]) 1312 { 1313 enum { C_NOTSET, C_LOWER, C_ONELOWER, C_ONEUPPER, C_UPPER } conv; 1314 size_t lbclen, lblen; /* Local copies. */ 1315 size_t mlen; /* Match length. */ 1316 size_t rpl; /* Remaining replacement length. */ 1317 CHAR_T *rp; /* Replacement pointer. */ 1318 int ch; 1319 int no; /* Match replacement offset. */ 1320 CHAR_T *p, *t; /* Buffer pointers. */ 1321 CHAR_T *lb; /* Local copies. */ 1322 1323 lb = *lbp; /* Get local copies. */ 1324 lbclen = *lbclenp; 1325 lblen = *lblenp; 1326 1327 /* 1328 * QUOTING NOTE: 1329 * 1330 * There are some special sequences that vi provides in the 1331 * replacement patterns. 1332 * & string the RE matched (\& if nomagic set) 1333 * \# n-th regular subexpression 1334 * \E end \U, \L conversion 1335 * \e end \U, \L conversion 1336 * \l convert the next character to lower-case 1337 * \L convert to lower-case, until \E, \e, or end of replacement 1338 * \u convert the next character to upper-case 1339 * \U convert to upper-case, until \E, \e, or end of replacement 1340 * 1341 * Otherwise, since this is the lowest level of replacement, discard 1342 * all escaping characters. This (hopefully) matches historic practice. 1343 */ 1344 #define OUTCH(ch, nltrans) { \ 1345 ARG_CHAR_T __ch = (ch); \ 1346 e_key_t __value = KEY_VAL(sp, __ch); \ 1347 if (nltrans && (__value == K_CR || __value == K_NL)) { \ 1348 NEEDNEWLINE(sp); \ 1349 sp->newl[sp->newl_cnt++] = lbclen; \ 1350 } else if (conv != C_NOTSET) { \ 1351 switch (conv) { \ 1352 case C_ONELOWER: \ 1353 conv = C_NOTSET; \ 1354 /* FALLTHROUGH */ \ 1355 case C_LOWER: \ 1356 if (ISUPPER(__ch)) \ 1357 __ch = TOLOWER(__ch); \ 1358 break; \ 1359 case C_ONEUPPER: \ 1360 conv = C_NOTSET; \ 1361 /* FALLTHROUGH */ \ 1362 case C_UPPER: \ 1363 if (ISLOWER(__ch)) \ 1364 __ch = TOUPPER(__ch); \ 1365 break; \ 1366 default: \ 1367 abort(); \ 1368 } \ 1369 } \ 1370 NEEDSP(sp, 1, p); \ 1371 *p++ = __ch; \ 1372 ++lbclen; \ 1373 } 1374 conv = C_NOTSET; 1375 for (rp = sp->repl, rpl = sp->repl_len, p = lb + lbclen; rpl--;) { 1376 switch (ch = *rp++) { 1377 case '&': 1378 if (O_ISSET(sp, O_MAGIC)) { 1379 no = 0; 1380 goto subzero; 1381 } 1382 break; 1383 case '\\': 1384 if (rpl == 0) 1385 break; 1386 --rpl; 1387 switch (ch = *rp) { 1388 case '&': 1389 ++rp; 1390 if (!O_ISSET(sp, O_MAGIC)) { 1391 no = 0; 1392 goto subzero; 1393 } 1394 break; 1395 case '0': case '1': case '2': case '3': case '4': 1396 case '5': case '6': case '7': case '8': case '9': 1397 no = *rp++ - '0'; 1398 subzero: if (match[no].rm_so == -1 || 1399 match[no].rm_eo == -1) 1400 break; 1401 mlen = match[no].rm_eo - match[no].rm_so; 1402 for (t = ip + match[no].rm_so; mlen--; ++t) 1403 OUTCH(*t, 0); 1404 continue; 1405 case 'e': 1406 case 'E': 1407 ++rp; 1408 conv = C_NOTSET; 1409 continue; 1410 case 'l': 1411 ++rp; 1412 conv = C_ONELOWER; 1413 continue; 1414 case 'L': 1415 ++rp; 1416 conv = C_LOWER; 1417 continue; 1418 case 'u': 1419 ++rp; 1420 conv = C_ONEUPPER; 1421 continue; 1422 case 'U': 1423 ++rp; 1424 conv = C_UPPER; 1425 continue; 1426 case '\r': 1427 OUTCH(ch, 0); 1428 continue; 1429 default: 1430 ++rp; 1431 break; 1432 } 1433 } 1434 OUTCH(ch, 1); 1435 } 1436 1437 *lbp = lb; /* Update caller's information. */ 1438 *lbclenp = lbclen; 1439 *lblenp = lblen; 1440 return (0); 1441 } 1442