1 /*- 2 * Copyright (c) 1992, 1993, 1994 3 * The Regents of the University of California. All rights reserved. 4 * Copyright (c) 1992, 1993, 1994, 1995, 1996 5 * Keith Bostic. All rights reserved. 6 * 7 * See the LICENSE file for redistribution information. 8 */ 9 10 #include "config.h" 11 12 #include <sys/types.h> 13 #include <sys/queue.h> 14 #include <sys/time.h> 15 16 #include <bitstring.h> 17 #include <ctype.h> 18 #include <errno.h> 19 #include <limits.h> 20 #include <stdio.h> 21 #include <stdlib.h> 22 #include <string.h> 23 #include <unistd.h> 24 25 #include "../common/common.h" 26 #include "../vi/vi.h" 27 28 #define SUB_FIRST 0x01 /* The 'r' flag isn't reasonable. */ 29 #define SUB_MUSTSETR 0x02 /* The 'r' flag is required. */ 30 31 static int re_conv(SCR *, CHAR_T **, size_t *, int *); 32 static int re_cscope_conv(SCR *, CHAR_T **, size_t *, int *); 33 static int re_sub(SCR *, 34 CHAR_T *, CHAR_T **, size_t *, size_t *, regmatch_t [10]); 35 static int re_tag_conv(SCR *, CHAR_T **, size_t *, int *); 36 static int s(SCR *, EXCMD *, CHAR_T *, regex_t *, u_int); 37 38 /* 39 * ex_s -- 40 * [line [,line]] s[ubstitute] [[/;]pat[/;]/repl[/;] [cgr] [count] [#lp]] 41 * 42 * Substitute on lines matching a pattern. 43 * 44 * PUBLIC: int ex_s(SCR *, EXCMD *); 45 */ 46 int 47 ex_s(SCR *sp, EXCMD *cmdp) 48 { 49 regex_t *re; 50 size_t blen, len; 51 u_int flags; 52 int delim; 53 CHAR_T *bp, *p, *ptrn, *rep, *t; 54 55 /* 56 * Skip leading white space. 57 * 58 * !!! 59 * Historic vi allowed any non-alphanumeric to serve as the 60 * substitution command delimiter. 61 * 62 * !!! 63 * If the arguments are empty, it's the same as &, i.e. we 64 * repeat the last substitution. 65 */ 66 if (cmdp->argc == 0) 67 goto subagain; 68 for (p = cmdp->argv[0]->bp, 69 len = cmdp->argv[0]->len; len > 0; --len, ++p) { 70 if (!cmdskip(*p)) 71 break; 72 } 73 if (len == 0) 74 subagain: return (ex_subagain(sp, cmdp)); 75 76 delim = *p++; 77 if (is09azAZ(delim) || delim == '\\') 78 return (s(sp, cmdp, p, &sp->subre_c, SUB_MUSTSETR)); 79 80 /* 81 * !!! 82 * The full-blown substitute command reset the remembered 83 * state of the 'c' and 'g' suffices. 84 */ 85 sp->c_suffix = sp->g_suffix = 0; 86 87 /* 88 * Get the pattern string, toss escaping characters. 89 * 90 * !!! 91 * Historic vi accepted any of the following forms: 92 * 93 * :s/abc/def/ change "abc" to "def" 94 * :s/abc/def change "abc" to "def" 95 * :s/abc/ delete "abc" 96 * :s/abc delete "abc" 97 * 98 * QUOTING NOTE: 99 * 100 * Only toss an escaping character if it escapes a delimiter. 101 * This means that "s/A/\\\\f" replaces "A" with "\\f". It 102 * would be nice to be more regular, i.e. for each layer of 103 * escaping a single escaping character is removed, but that's 104 * not how the historic vi worked. 105 */ 106 for (ptrn = t = p;;) { 107 if (p[0] == '\0' || p[0] == delim) { 108 if (p[0] == delim) 109 ++p; 110 /* 111 * !!! 112 * Nul terminate the pattern string -- it's passed 113 * to regcomp which doesn't understand anything else. 114 */ 115 *t = '\0'; 116 break; 117 } 118 if (p[0] == '\\') 119 if (p[1] == delim) 120 ++p; 121 else if (p[1] == '\\') 122 *t++ = *p++; 123 *t++ = *p++; 124 } 125 126 /* 127 * If the pattern string is empty, use the last RE (not just the 128 * last substitution RE). 129 */ 130 if (*ptrn == '\0') { 131 if (sp->re == NULL) { 132 ex_emsg(sp, NULL, EXM_NOPREVRE); 133 return (1); 134 } 135 136 /* Re-compile the RE if necessary. */ 137 if (!F_ISSET(sp, SC_RE_SEARCH) && 138 re_compile(sp, sp->re, sp->re_len, 139 NULL, NULL, &sp->re_c, RE_C_SEARCH)) 140 return (1); 141 flags = 0; 142 } else { 143 /* 144 * !!! 145 * Compile the RE. Historic practice is that substitutes set 146 * the search direction as well as both substitute and search 147 * RE's. We compile the RE twice, as we don't want to bother 148 * ref counting the pattern string and (opaque) structure. 149 */ 150 if (re_compile(sp, ptrn, t - ptrn, &sp->re, 151 &sp->re_len, &sp->re_c, RE_C_SEARCH)) 152 return (1); 153 if (re_compile(sp, ptrn, t - ptrn, &sp->subre, 154 &sp->subre_len, &sp->subre_c, RE_C_SUBST)) 155 return (1); 156 157 flags = SUB_FIRST; 158 sp->searchdir = FORWARD; 159 } 160 re = &sp->re_c; 161 162 /* 163 * Get the replacement string. 164 * 165 * The special character & (\& if O_MAGIC not set) matches the 166 * entire RE. No handling of & is required here, it's done by 167 * re_sub(). 168 * 169 * The special character ~ (\~ if O_MAGIC not set) inserts the 170 * previous replacement string into this replacement string. 171 * Count ~'s to figure out how much space we need. We could 172 * special case nonexistent last patterns or whether or not 173 * O_MAGIC is set, but it's probably not worth the effort. 174 * 175 * QUOTING NOTE: 176 * 177 * Only toss an escaping character if it escapes a delimiter or 178 * if O_MAGIC is set and it escapes a tilde. 179 * 180 * !!! 181 * If the entire replacement pattern is "%", then use the last 182 * replacement pattern. This semantic was added to vi in System 183 * V and then percolated elsewhere, presumably around the time 184 * that it was added to their version of ed(1). 185 */ 186 if (p[0] == '\0' || p[0] == delim) { 187 if (p[0] == delim) 188 ++p; 189 free(sp->repl); 190 sp->repl = NULL; 191 sp->repl_len = 0; 192 } else if (p[0] == '%' && (p[1] == '\0' || p[1] == delim)) 193 p += p[1] == delim ? 2 : 1; 194 else { 195 for (rep = p, len = 0; 196 p[0] != '\0' && p[0] != delim; ++p, ++len) 197 if (p[0] == '~') 198 len += sp->repl_len; 199 GET_SPACE_RETW(sp, bp, blen, len); 200 for (t = bp, len = 0, p = rep;;) { 201 if (p[0] == '\0' || p[0] == delim) { 202 if (p[0] == delim) 203 ++p; 204 break; 205 } 206 if (p[0] == '\\') { 207 if (p[1] == delim) 208 ++p; 209 else if (p[1] == '\\') { 210 *t++ = *p++; 211 ++len; 212 } else if (p[1] == '~') { 213 ++p; 214 if (!O_ISSET(sp, O_MAGIC)) 215 goto tilde; 216 } 217 } else if (p[0] == '~' && O_ISSET(sp, O_MAGIC)) { 218 tilde: ++p; 219 MEMCPY(t, sp->repl, sp->repl_len); 220 t += sp->repl_len; 221 len += sp->repl_len; 222 continue; 223 } 224 *t++ = *p++; 225 ++len; 226 } 227 if ((sp->repl_len = len) != 0) { 228 free(sp->repl); 229 MALLOC(sp, sp->repl, len * sizeof(CHAR_T)); 230 if (sp->repl == NULL) { 231 FREE_SPACEW(sp, bp, blen); 232 return (1); 233 } 234 MEMCPY(sp->repl, bp, len); 235 } 236 FREE_SPACEW(sp, bp, blen); 237 } 238 return (s(sp, cmdp, p, re, flags)); 239 } 240 241 /* 242 * ex_subagain -- 243 * [line [,line]] & [cgr] [count] [#lp]] 244 * 245 * Substitute using the last substitute RE and replacement pattern. 246 * 247 * PUBLIC: int ex_subagain(SCR *, EXCMD *); 248 */ 249 int 250 ex_subagain(SCR *sp, EXCMD *cmdp) 251 { 252 if (sp->subre == NULL) { 253 ex_emsg(sp, NULL, EXM_NOPREVRE); 254 return (1); 255 } 256 if (!F_ISSET(sp, SC_RE_SUBST) && 257 re_compile(sp, sp->subre, sp->subre_len, 258 NULL, NULL, &sp->subre_c, RE_C_SUBST)) 259 return (1); 260 return (s(sp, 261 cmdp, cmdp->argc ? cmdp->argv[0]->bp : NULL, &sp->subre_c, 0)); 262 } 263 264 /* 265 * ex_subtilde -- 266 * [line [,line]] ~ [cgr] [count] [#lp]] 267 * 268 * Substitute using the last RE and last substitute replacement pattern. 269 * 270 * PUBLIC: int ex_subtilde(SCR *, EXCMD *); 271 */ 272 int 273 ex_subtilde(SCR *sp, EXCMD *cmdp) 274 { 275 if (sp->re == NULL) { 276 ex_emsg(sp, NULL, EXM_NOPREVRE); 277 return (1); 278 } 279 if (!F_ISSET(sp, SC_RE_SEARCH) && re_compile(sp, sp->re, 280 sp->re_len, NULL, NULL, &sp->re_c, RE_C_SEARCH)) 281 return (1); 282 return (s(sp, 283 cmdp, cmdp->argc ? cmdp->argv[0]->bp : NULL, &sp->re_c, 0)); 284 } 285 286 /* 287 * s -- 288 * Do the substitution. This stuff is *really* tricky. There are lots of 289 * special cases, and general nastiness. Don't mess with it unless you're 290 * pretty confident. 291 * 292 * The nasty part of the substitution is what happens when the replacement 293 * string contains newlines. It's a bit tricky -- consider the information 294 * that has to be retained for "s/f\(o\)o/^M\1^M\1/". The solution here is 295 * to build a set of newline offsets which we use to break the line up later, 296 * when the replacement is done. Don't change it unless you're *damned* 297 * confident. 298 */ 299 #define NEEDNEWLINE(sp) { \ 300 if (sp->newl_len == sp->newl_cnt) { \ 301 sp->newl_len += 25; \ 302 REALLOC(sp, sp->newl, size_t *, \ 303 sp->newl_len * sizeof(size_t)); \ 304 if (sp->newl == NULL) { \ 305 sp->newl_len = 0; \ 306 return (1); \ 307 } \ 308 } \ 309 } 310 311 #define BUILD(sp, l, len) { \ 312 if (lbclen + (len) > lblen) { \ 313 lblen = p2roundup(MAX(lbclen + (len), 256)); \ 314 REALLOC(sp, lb, CHAR_T *, lblen * sizeof(CHAR_T)); \ 315 if (lb == NULL) { \ 316 lbclen = 0; \ 317 return (1); \ 318 } \ 319 } \ 320 MEMCPY(lb + lbclen, l, len); \ 321 lbclen += len; \ 322 } 323 324 #define NEEDSP(sp, len, pnt) { \ 325 if (lbclen + (len) > lblen) { \ 326 lblen = p2roundup(MAX(lbclen + (len), 256)); \ 327 REALLOC(sp, lb, CHAR_T *, lblen * sizeof(CHAR_T)); \ 328 if (lb == NULL) { \ 329 lbclen = 0; \ 330 return (1); \ 331 } \ 332 pnt = lb + lbclen; \ 333 } \ 334 } 335 336 static int 337 s(SCR *sp, EXCMD *cmdp, CHAR_T *s, regex_t *re, u_int flags) 338 { 339 EVENT ev; 340 MARK from, to; 341 TEXTH tiq[] = {{ 0 }}; 342 recno_t elno, lno, slno; 343 u_long ul; 344 regmatch_t match[10]; 345 size_t blen, cnt, last, lbclen, lblen, len, llen; 346 size_t offset, saved_offset, scno; 347 int cflag, lflag, nflag, pflag, rflag; 348 int didsub, do_eol_match, eflags, empty_ok, eval; 349 int linechanged, matched, quit, rval; 350 CHAR_T *bp, *lb; 351 enum nresult nret; 352 353 NEEDFILE(sp, cmdp); 354 355 slno = sp->lno; 356 scno = sp->cno; 357 358 /* 359 * !!! 360 * Historically, the 'g' and 'c' suffices were always toggled as flags, 361 * so ":s/A/B/" was the same as ":s/A/B/ccgg". If O_EDCOMPATIBLE was 362 * not set, they were initialized to 0 for all substitute commands. If 363 * O_EDCOMPATIBLE was set, they were initialized to 0 only if the user 364 * specified substitute/replacement patterns (see ex_s()). 365 */ 366 if (!O_ISSET(sp, O_EDCOMPATIBLE)) 367 sp->c_suffix = sp->g_suffix = 0; 368 369 /* 370 * Historic vi permitted the '#', 'l' and 'p' options in vi mode, but 371 * it only displayed the last change. I'd disallow them, but they are 372 * useful in combination with the [v]global commands. In the current 373 * model the problem is combining them with the 'c' flag -- the screen 374 * would have to flip back and forth between the confirm screen and the 375 * ex print screen, which would be pretty awful. We do display all 376 * changes, though, for what that's worth. 377 * 378 * !!! 379 * Historic vi was fairly strict about the order of "options", the 380 * count, and "flags". I'm somewhat fuzzy on the difference between 381 * options and flags, anyway, so this is a simpler approach, and we 382 * just take it them in whatever order the user gives them. (The ex 383 * usage statement doesn't reflect this.) 384 */ 385 cflag = lflag = nflag = pflag = rflag = 0; 386 if (s == NULL) 387 goto noargs; 388 for (lno = OOBLNO; *s != '\0'; ++s) 389 switch (*s) { 390 case ' ': 391 case '\t': 392 continue; 393 case '+': 394 ++cmdp->flagoff; 395 break; 396 case '-': 397 --cmdp->flagoff; 398 break; 399 case '0': case '1': case '2': case '3': case '4': 400 case '5': case '6': case '7': case '8': case '9': 401 if (lno != OOBLNO) 402 goto usage; 403 errno = 0; 404 nret = nget_uslong(&ul, s, &s, 10); 405 lno = ul; 406 if (*s == '\0') /* Loop increment correction. */ 407 --s; 408 if (nret != NUM_OK) { 409 if (nret == NUM_OVER) 410 msgq(sp, M_ERR, "153|Count overflow"); 411 else if (nret == NUM_UNDER) 412 msgq(sp, M_ERR, "154|Count underflow"); 413 else 414 msgq(sp, M_SYSERR, NULL); 415 return (1); 416 } 417 /* 418 * In historic vi, the count was inclusive from the 419 * second address. 420 */ 421 cmdp->addr1.lno = cmdp->addr2.lno; 422 cmdp->addr2.lno += lno - 1; 423 if (!db_exist(sp, cmdp->addr2.lno) && 424 db_last(sp, &cmdp->addr2.lno)) 425 return (1); 426 break; 427 case '#': 428 nflag = 1; 429 break; 430 case 'c': 431 sp->c_suffix = !sp->c_suffix; 432 433 /* Ex text structure initialization. */ 434 if (F_ISSET(sp, SC_EX)) 435 TAILQ_INIT(tiq); 436 break; 437 case 'g': 438 sp->g_suffix = !sp->g_suffix; 439 break; 440 case 'l': 441 lflag = 1; 442 break; 443 case 'p': 444 pflag = 1; 445 break; 446 case 'r': 447 if (LF_ISSET(SUB_FIRST)) { 448 msgq(sp, M_ERR, 449 "155|Regular expression specified; r flag meaningless"); 450 return (1); 451 } 452 if (!F_ISSET(sp, SC_RE_SEARCH)) { 453 ex_emsg(sp, NULL, EXM_NOPREVRE); 454 return (1); 455 } 456 rflag = 1; 457 re = &sp->re_c; 458 break; 459 default: 460 goto usage; 461 } 462 463 if (*s != '\0' || (!rflag && LF_ISSET(SUB_MUSTSETR))) { 464 usage: ex_emsg(sp, cmdp->cmd->usage, EXM_USAGE); 465 return (1); 466 } 467 468 noargs: if (F_ISSET(sp, SC_VI) && sp->c_suffix && (lflag || nflag || pflag)) { 469 msgq(sp, M_ERR, 470 "156|The #, l and p flags may not be combined with the c flag in vi mode"); 471 return (1); 472 } 473 474 /* 475 * bp: if interactive, line cache 476 * blen: if interactive, line cache length 477 * lb: build buffer pointer. 478 * lbclen: current length of built buffer. 479 * lblen; length of build buffer. 480 */ 481 bp = lb = NULL; 482 blen = lbclen = lblen = 0; 483 484 /* For each line... */ 485 lno = cmdp->addr1.lno == 0 ? 1 : cmdp->addr1.lno; 486 for (matched = quit = 0, 487 elno = cmdp->addr2.lno; !quit && lno <= elno; ++lno) { 488 489 /* Someone's unhappy, time to stop. */ 490 if (INTERRUPTED(sp)) 491 break; 492 493 /* Get the line. */ 494 if (db_get(sp, lno, DBG_FATAL, &s, &llen)) 495 goto err; 496 497 /* 498 * Make a local copy if doing confirmation -- when calling 499 * the confirm routine we're likely to lose the cached copy. 500 */ 501 if (sp->c_suffix) { 502 if (bp == NULL) { 503 GET_SPACE_RETW(sp, bp, blen, llen); 504 } else 505 ADD_SPACE_RETW(sp, bp, blen, llen); 506 MEMCPY(bp, s, llen); 507 s = bp; 508 } 509 510 /* Start searching from the beginning. */ 511 offset = 0; 512 len = llen; 513 514 /* Reset the build buffer offset. */ 515 lbclen = 0; 516 517 /* Reset empty match flag. */ 518 empty_ok = 1; 519 520 /* 521 * We don't want to have to do a setline if the line didn't 522 * change -- keep track of whether or not this line changed. 523 * If doing confirmations, don't want to keep setting the 524 * line if change is refused -- keep track of substitutions. 525 */ 526 didsub = linechanged = 0; 527 528 /* New line, do an EOL match. */ 529 do_eol_match = 1; 530 531 /* It's not nul terminated, but we pretend it is. */ 532 eflags = REG_STARTEND; 533 534 /* 535 * The search area is from s + offset to the EOL. 536 * 537 * Generally, match[0].rm_so is the offset of the start 538 * of the match from the start of the search, and offset 539 * is the offset of the start of the last search. 540 */ 541 nextmatch: match[0].rm_so = 0; 542 match[0].rm_eo = len; 543 544 /* Get the next match. */ 545 eval = regexec(re, s + offset, 10, match, eflags); 546 547 /* 548 * There wasn't a match or if there was an error, deal with 549 * it. If there was a previous match in this line, resolve 550 * the changes into the database. Otherwise, just move on. 551 */ 552 if (eval == REG_NOMATCH) 553 goto endmatch; 554 if (eval != 0) { 555 re_error(sp, eval, re); 556 goto err; 557 } 558 matched = 1; 559 560 /* Only the first search can match an anchored expression. */ 561 eflags |= REG_NOTBOL; 562 563 /* 564 * !!! 565 * It's possible to match 0-length strings -- for example, the 566 * command s;a*;X;, when matched against the string "aabb" will 567 * result in "XbXbX", i.e. the matches are "aa", the space 568 * between the b's and the space between the b's and the end of 569 * the string. There is a similar space between the beginning 570 * of the string and the a's. The rule that we use (because vi 571 * historically used it) is that any 0-length match, occurring 572 * immediately after a match, is ignored. Otherwise, the above 573 * example would have resulted in "XXbXbX". Another example is 574 * incorrectly using " *" to replace groups of spaces with one 575 * space. 576 * 577 * The way we do this is that if we just had a successful match, 578 * the starting offset does not skip characters, and the match 579 * is empty, ignore the match and move forward. If there's no 580 * more characters in the string, we were attempting to match 581 * after the last character, so quit. 582 */ 583 if (!empty_ok && match[0].rm_so == 0 && match[0].rm_eo == 0) { 584 empty_ok = 1; 585 if (len == 0) 586 goto endmatch; 587 BUILD(sp, s + offset, 1) 588 ++offset; 589 --len; 590 goto nextmatch; 591 } 592 593 /* Confirm change. */ 594 if (sp->c_suffix) { 595 /* 596 * Set the cursor position for confirmation. Note, 597 * if we matched on a '$', the cursor may be past 598 * the end of line. 599 */ 600 from.lno = to.lno = lno; 601 from.cno = match[0].rm_so + offset; 602 to.cno = match[0].rm_eo + offset; 603 /* 604 * Both ex and vi have to correct for a change before 605 * the first character in the line. 606 */ 607 if (llen == 0) 608 from.cno = to.cno = 0; 609 if (F_ISSET(sp, SC_VI)) { 610 /* 611 * Only vi has to correct for a change after 612 * the last character in the line. 613 * 614 * XXX 615 * It would be nice to change the vi code so 616 * that we could display a cursor past EOL. 617 */ 618 if (to.cno >= llen) 619 to.cno = llen - 1; 620 if (from.cno >= llen) 621 from.cno = llen - 1; 622 623 sp->lno = from.lno; 624 sp->cno = from.cno; 625 if (vs_refresh(sp, 1)) 626 goto err; 627 628 vs_update(sp, msg_cat(sp, 629 "169|Confirm change? [n]", NULL), NULL); 630 631 if (v_event_get(sp, &ev, 0, 0)) 632 goto err; 633 switch (ev.e_event) { 634 case E_CHARACTER: 635 break; 636 case E_EOF: 637 case E_ERR: 638 case E_INTERRUPT: 639 goto lquit; 640 default: 641 v_event_err(sp, &ev); 642 goto lquit; 643 } 644 } else { 645 if (ex_print(sp, cmdp, &from, &to, 0) || 646 ex_scprint(sp, &from, &to)) 647 goto lquit; 648 if (ex_txt(sp, tiq, 0, TXT_CR)) 649 goto err; 650 ev.e_c = TAILQ_FIRST(tiq)->lb[0]; 651 } 652 653 switch (ev.e_c) { 654 case CH_YES: 655 break; 656 default: 657 case CH_NO: 658 didsub = 0; 659 BUILD(sp, s +offset, match[0].rm_eo); 660 goto skip; 661 case CH_QUIT: 662 /* Set the quit/interrupted flags. */ 663 lquit: quit = 1; 664 F_SET(sp->gp, G_INTERRUPTED); 665 666 /* 667 * Resolve any changes, then return to (and 668 * exit from) the main loop. 669 */ 670 goto endmatch; 671 } 672 } 673 674 /* 675 * Set the cursor to the last position changed, converting 676 * from 1-based to 0-based. 677 */ 678 sp->lno = lno; 679 sp->cno = match[0].rm_so; 680 681 /* Copy the bytes before the match into the build buffer. */ 682 BUILD(sp, s + offset, match[0].rm_so); 683 684 /* Substitute the matching bytes. */ 685 didsub = 1; 686 if (re_sub(sp, s + offset, &lb, &lbclen, &lblen, match)) 687 goto err; 688 689 /* Set the change flag so we know this line was modified. */ 690 linechanged = 1; 691 692 /* Move past the matched bytes. */ 693 skip: offset += match[0].rm_eo; 694 len -= match[0].rm_eo; 695 696 /* A match cannot be followed by an empty pattern. */ 697 empty_ok = 0; 698 699 /* 700 * If doing a global change with confirmation, we have to 701 * update the screen. The basic idea is to store the line 702 * so the screen update routines can find it, and restart. 703 */ 704 if (didsub && sp->c_suffix && sp->g_suffix) { 705 /* 706 * The new search offset will be the end of the 707 * modified line. 708 */ 709 saved_offset = lbclen; 710 711 /* Copy the rest of the line. */ 712 if (len) 713 BUILD(sp, s + offset, len) 714 715 /* Set the new offset. */ 716 offset = saved_offset; 717 718 /* Store inserted lines, adjusting the build buffer. */ 719 last = 0; 720 if (sp->newl_cnt) { 721 for (cnt = 0; 722 cnt < sp->newl_cnt; ++cnt, ++lno, ++elno) { 723 if (db_insert(sp, lno, 724 lb + last, sp->newl[cnt] - last)) 725 goto err; 726 last = sp->newl[cnt] + 1; 727 ++sp->rptlines[L_ADDED]; 728 } 729 lbclen -= last; 730 offset -= last; 731 sp->newl_cnt = 0; 732 } 733 734 /* Store and retrieve the line. */ 735 if (db_set(sp, lno, lb + last, lbclen)) 736 goto err; 737 if (db_get(sp, lno, DBG_FATAL, &s, &llen)) 738 goto err; 739 ADD_SPACE_RETW(sp, bp, blen, llen) 740 MEMCPY(bp, s, llen); 741 s = bp; 742 len = llen - offset; 743 744 /* Restart the build. */ 745 lbclen = 0; 746 BUILD(sp, s, offset); 747 748 /* 749 * If we haven't already done the after-the-string 750 * match, do one. Set REG_NOTEOL so the '$' pattern 751 * only matches once. 752 */ 753 if (!do_eol_match) 754 goto endmatch; 755 if (offset == len) { 756 do_eol_match = 0; 757 eflags |= REG_NOTEOL; 758 } 759 goto nextmatch; 760 } 761 762 /* 763 * If it's a global: 764 * 765 * If at the end of the string, do a test for the after 766 * the string match. Set REG_NOTEOL so the '$' pattern 767 * only matches once. 768 */ 769 if (sp->g_suffix && do_eol_match) { 770 if (len == 0) { 771 do_eol_match = 0; 772 eflags |= REG_NOTEOL; 773 } 774 goto nextmatch; 775 } 776 777 endmatch: if (!linechanged) 778 continue; 779 780 /* Copy any remaining bytes into the build buffer. */ 781 if (len) 782 BUILD(sp, s + offset, len) 783 784 /* Store inserted lines, adjusting the build buffer. */ 785 last = 0; 786 if (sp->newl_cnt) { 787 for (cnt = 0; 788 cnt < sp->newl_cnt; ++cnt, ++lno, ++elno) { 789 if (db_insert(sp, 790 lno, lb + last, sp->newl[cnt] - last)) 791 goto err; 792 last = sp->newl[cnt] + 1; 793 ++sp->rptlines[L_ADDED]; 794 } 795 lbclen -= last; 796 sp->newl_cnt = 0; 797 } 798 799 /* Store the changed line. */ 800 if (db_set(sp, lno, lb + last, lbclen)) 801 goto err; 802 803 /* Update changed line counter. */ 804 if (sp->rptlchange != lno) { 805 sp->rptlchange = lno; 806 ++sp->rptlines[L_CHANGED]; 807 } 808 809 /* 810 * !!! 811 * Display as necessary. Historic practice is to only 812 * display the last line of a line split into multiple 813 * lines. 814 */ 815 if (lflag || nflag || pflag) { 816 from.lno = to.lno = lno; 817 from.cno = to.cno = 0; 818 if (lflag) 819 (void)ex_print(sp, cmdp, &from, &to, E_C_LIST); 820 if (nflag) 821 (void)ex_print(sp, cmdp, &from, &to, E_C_HASH); 822 if (pflag) 823 (void)ex_print(sp, cmdp, &from, &to, E_C_PRINT); 824 } 825 } 826 827 /* 828 * !!! 829 * Historically, vi attempted to leave the cursor at the same place if 830 * the substitution was done at the current cursor position. Otherwise 831 * it moved it to the first non-blank of the last line changed. There 832 * were some problems: for example, :s/$/foo/ with the cursor on the 833 * last character of the line left the cursor on the last character, or 834 * the & command with multiple occurrences of the matching string in the 835 * line usually left the cursor in a fairly random position. 836 * 837 * We try to do the same thing, with the exception that if the user is 838 * doing substitution with confirmation, we move to the last line about 839 * which the user was consulted, as opposed to the last line that they 840 * actually changed. This prevents a screen flash if the user doesn't 841 * change many of the possible lines. 842 */ 843 if (!sp->c_suffix && (sp->lno != slno || sp->cno != scno)) { 844 sp->cno = 0; 845 (void)nonblank(sp, sp->lno, &sp->cno); 846 } 847 848 /* 849 * If not in a global command, and nothing matched, say so. 850 * Else, if none of the lines displayed, put something up. 851 */ 852 rval = 0; 853 if (!matched) { 854 if (!F_ISSET(sp, SC_EX_GLOBAL)) { 855 msgq(sp, M_ERR, "157|No match found"); 856 goto err; 857 } 858 } else if (!lflag && !nflag && !pflag) 859 F_SET(cmdp, E_AUTOPRINT); 860 861 if (0) { 862 err: rval = 1; 863 } 864 865 if (bp != NULL) 866 FREE_SPACEW(sp, bp, blen); 867 free(lb); 868 return (rval); 869 } 870 871 /* 872 * re_compile -- 873 * Compile the RE. 874 * 875 * PUBLIC: int re_compile(SCR *, 876 * PUBLIC: CHAR_T *, size_t, CHAR_T **, size_t *, regex_t *, u_int); 877 */ 878 int 879 re_compile(SCR *sp, CHAR_T *ptrn, size_t plen, CHAR_T **ptrnp, size_t *lenp, regex_t *rep, u_int flags) 880 { 881 size_t len; 882 int reflags, replaced, rval; 883 CHAR_T *p; 884 885 /* Set RE flags. */ 886 reflags = 0; 887 if (!LF_ISSET(RE_C_CSCOPE | RE_C_TAG)) { 888 if (O_ISSET(sp, O_EXTENDED)) 889 reflags |= REG_EXTENDED; 890 if (O_ISSET(sp, O_IGNORECASE)) 891 reflags |= REG_ICASE; 892 if (O_ISSET(sp, O_ICLOWER)) { 893 for (p = ptrn, len = plen; len > 0; ++p, --len) 894 if (ISUPPER(*p)) 895 break; 896 if (len == 0) 897 reflags |= REG_ICASE; 898 } 899 } 900 901 /* If we're replacing a saved value, clear the old one. */ 902 if (LF_ISSET(RE_C_SEARCH) && F_ISSET(sp, SC_RE_SEARCH)) { 903 regfree(&sp->re_c); 904 F_CLR(sp, SC_RE_SEARCH); 905 } 906 if (LF_ISSET(RE_C_SUBST) && F_ISSET(sp, SC_RE_SUBST)) { 907 regfree(&sp->subre_c); 908 F_CLR(sp, SC_RE_SUBST); 909 } 910 911 /* 912 * If we're saving the string, it's a pattern we haven't seen before, 913 * so convert the vi-style RE's to POSIX 1003.2 RE's. Save a copy for 914 * later recompilation. Free any previously saved value. 915 */ 916 if (ptrnp != NULL) { 917 replaced = 0; 918 if (LF_ISSET(RE_C_CSCOPE)) { 919 if (re_cscope_conv(sp, &ptrn, &plen, &replaced)) 920 return (1); 921 /* 922 * XXX 923 * Currently, the match-any-<blank> expression used in 924 * re_cscope_conv() requires extended RE's. This may 925 * not be right or safe. 926 */ 927 reflags |= REG_EXTENDED; 928 } else if (LF_ISSET(RE_C_TAG)) { 929 if (re_tag_conv(sp, &ptrn, &plen, &replaced)) 930 return (1); 931 } else 932 if (re_conv(sp, &ptrn, &plen, &replaced)) 933 return (1); 934 935 /* Discard previous pattern. */ 936 free(*ptrnp); 937 *ptrnp = NULL; 938 939 if (lenp != NULL) 940 *lenp = plen; 941 942 /* 943 * Copy the string into allocated memory. 944 * 945 * XXX 946 * Regcomp isn't 8-bit clean, so the pattern is nul-terminated 947 * for now. There's just no other solution. 948 */ 949 MALLOC(sp, *ptrnp, (plen + 1) * sizeof(CHAR_T)); 950 if (*ptrnp != NULL) { 951 MEMCPY(*ptrnp, ptrn, plen); 952 (*ptrnp)[plen] = '\0'; 953 } 954 955 /* Free up conversion-routine-allocated memory. */ 956 if (replaced) 957 FREE_SPACEW(sp, ptrn, 0); 958 959 if (*ptrnp == NULL) 960 return (1); 961 962 ptrn = *ptrnp; 963 } 964 965 /* 966 * XXX 967 * Regcomp isn't 8-bit clean, so we just lost if the pattern 968 * contained a nul. Bummer! 969 */ 970 if ((rval = regcomp(rep, ptrn, /* plen, */ reflags)) != 0) { 971 if (!LF_ISSET(RE_C_SILENT)) 972 re_error(sp, rval, rep); 973 return (1); 974 } 975 976 if (LF_ISSET(RE_C_SEARCH)) 977 F_SET(sp, SC_RE_SEARCH); 978 if (LF_ISSET(RE_C_SUBST)) 979 F_SET(sp, SC_RE_SUBST); 980 981 return (0); 982 } 983 984 /* 985 * re_conv -- 986 * Convert vi's regular expressions into something that the 987 * the POSIX 1003.2 RE functions can handle. 988 * 989 * There are three conversions we make to make vi's RE's (specifically 990 * the global, search, and substitute patterns) work with POSIX RE's. 991 * 992 * 1: If O_MAGIC is not set, strip backslashes from the magic character 993 * set (.[*~) that have them, and add them to the ones that don't. 994 * 2: If O_MAGIC is not set, the string "\~" is replaced with the text 995 * from the last substitute command's replacement string. If O_MAGIC 996 * is set, it's the string "~". 997 * 3: The pattern \<ptrn\> does "word" searches, convert it to use the 998 * new RE escapes. 999 * 1000 * !!!/XXX 1001 * This doesn't exactly match the historic behavior of vi because we do 1002 * the ~ substitution before calling the RE engine, so magic characters 1003 * in the replacement string will be expanded by the RE engine, and they 1004 * weren't historically. It's a bug. 1005 */ 1006 static int 1007 re_conv(SCR *sp, CHAR_T **ptrnp, size_t *plenp, int *replacedp) 1008 { 1009 size_t blen, len, needlen; 1010 int magic; 1011 CHAR_T *bp, *p, *t; 1012 1013 /* 1014 * First pass through, we figure out how much space we'll need. 1015 * We do it in two passes, on the grounds that most of the time 1016 * the user is doing a search and won't have magic characters. 1017 * That way we can skip most of the memory allocation and copies. 1018 */ 1019 magic = 0; 1020 for (p = *ptrnp, len = *plenp, needlen = 0; len > 0; ++p, --len) 1021 switch (*p) { 1022 case '\\': 1023 if (len > 1) { 1024 --len; 1025 switch (*++p) { 1026 case '<': 1027 magic = 1; 1028 needlen += RE_WSTART_LEN + 1; 1029 break; 1030 case '>': 1031 magic = 1; 1032 needlen += RE_WSTOP_LEN + 1; 1033 break; 1034 case '~': 1035 if (!O_ISSET(sp, O_MAGIC)) { 1036 magic = 1; 1037 needlen += sp->repl_len; 1038 } 1039 break; 1040 case '.': 1041 case '[': 1042 case '*': 1043 if (!O_ISSET(sp, O_MAGIC)) { 1044 magic = 1; 1045 needlen += 1; 1046 } 1047 break; 1048 default: 1049 needlen += 2; 1050 } 1051 } else 1052 needlen += 1; 1053 break; 1054 case '~': 1055 if (O_ISSET(sp, O_MAGIC)) { 1056 magic = 1; 1057 needlen += sp->repl_len; 1058 } 1059 break; 1060 case '.': 1061 case '[': 1062 case '*': 1063 if (!O_ISSET(sp, O_MAGIC)) { 1064 magic = 1; 1065 needlen += 2; 1066 } 1067 break; 1068 default: 1069 needlen += 1; 1070 break; 1071 } 1072 1073 if (!magic) { 1074 *replacedp = 0; 1075 return (0); 1076 } 1077 1078 /* Get enough memory to hold the final pattern. */ 1079 *replacedp = 1; 1080 GET_SPACE_RETW(sp, bp, blen, needlen); 1081 1082 for (p = *ptrnp, len = *plenp, t = bp; len > 0; ++p, --len) 1083 switch (*p) { 1084 case '\\': 1085 if (len > 1) { 1086 --len; 1087 switch (*++p) { 1088 case '<': 1089 MEMCPY(t, 1090 RE_WSTART, RE_WSTART_LEN); 1091 t += RE_WSTART_LEN; 1092 break; 1093 case '>': 1094 MEMCPY(t, 1095 RE_WSTOP, RE_WSTOP_LEN); 1096 t += RE_WSTOP_LEN; 1097 break; 1098 case '~': 1099 if (O_ISSET(sp, O_MAGIC)) 1100 *t++ = '~'; 1101 else { 1102 MEMCPY(t, 1103 sp->repl, sp->repl_len); 1104 t += sp->repl_len; 1105 } 1106 break; 1107 case '.': 1108 case '[': 1109 case '*': 1110 if (O_ISSET(sp, O_MAGIC)) 1111 *t++ = '\\'; 1112 *t++ = *p; 1113 break; 1114 default: 1115 *t++ = '\\'; 1116 *t++ = *p; 1117 } 1118 } else 1119 *t++ = '\\'; 1120 break; 1121 case '~': 1122 if (O_ISSET(sp, O_MAGIC)) { 1123 MEMCPY(t, sp->repl, sp->repl_len); 1124 t += sp->repl_len; 1125 } else 1126 *t++ = '~'; 1127 break; 1128 case '.': 1129 case '[': 1130 case '*': 1131 if (!O_ISSET(sp, O_MAGIC)) 1132 *t++ = '\\'; 1133 *t++ = *p; 1134 break; 1135 default: 1136 *t++ = *p; 1137 break; 1138 } 1139 1140 *ptrnp = bp; 1141 *plenp = t - bp; 1142 return (0); 1143 } 1144 1145 /* 1146 * re_tag_conv -- 1147 * Convert a tags search path into something that the POSIX 1148 * 1003.2 RE functions can handle. 1149 */ 1150 static int 1151 re_tag_conv(SCR *sp, CHAR_T **ptrnp, size_t *plenp, int *replacedp) 1152 { 1153 size_t blen, len; 1154 int lastdollar; 1155 CHAR_T *bp, *p, *t; 1156 1157 len = *plenp; 1158 1159 /* Max memory usage is 2 times the length of the string. */ 1160 *replacedp = 1; 1161 GET_SPACE_RETW(sp, bp, blen, len * 2); 1162 1163 p = *ptrnp; 1164 t = bp; 1165 1166 /* If the last character is a '/' or '?', we just strip it. */ 1167 if (len > 0 && (p[len - 1] == '/' || p[len - 1] == '?')) 1168 --len; 1169 1170 /* If the next-to-last or last character is a '$', it's magic. */ 1171 if (len > 0 && p[len - 1] == '$') { 1172 --len; 1173 lastdollar = 1; 1174 } else 1175 lastdollar = 0; 1176 1177 /* If the first character is a '/' or '?', we just strip it. */ 1178 if (len > 0 && (p[0] == '/' || p[0] == '?')) { 1179 ++p; 1180 --len; 1181 } 1182 1183 /* If the first or second character is a '^', it's magic. */ 1184 if (p[0] == '^') { 1185 *t++ = *p++; 1186 --len; 1187 } 1188 1189 /* 1190 * Escape every other magic character we can find, meanwhile stripping 1191 * the backslashes ctags inserts when escaping the search delimiter 1192 * characters. 1193 */ 1194 for (; len > 0; --len) { 1195 if (p[0] == '\\' && (p[1] == '/' || p[1] == '?')) { 1196 ++p; 1197 --len; 1198 } else if (STRCHR(L("^.[]$*"), p[0])) 1199 *t++ = '\\'; 1200 *t++ = *p++; 1201 } 1202 if (lastdollar) 1203 *t++ = '$'; 1204 1205 *ptrnp = bp; 1206 *plenp = t - bp; 1207 return (0); 1208 } 1209 1210 /* 1211 * re_cscope_conv -- 1212 * Convert a cscope search path into something that the POSIX 1213 * 1003.2 RE functions can handle. 1214 */ 1215 static int 1216 re_cscope_conv(SCR *sp, CHAR_T **ptrnp, size_t *plenp, int *replacedp) 1217 { 1218 size_t blen, len, nspaces; 1219 CHAR_T *bp, *t; 1220 CHAR_T *p; 1221 CHAR_T *wp; 1222 size_t wlen; 1223 1224 /* 1225 * Each space in the source line printed by cscope represents an 1226 * arbitrary sequence of spaces, tabs, and comments. 1227 */ 1228 #define CSCOPE_RE_SPACE "([ \t]|/\\*([^*]|\\*/)*\\*/)*" 1229 #define CSCOPE_LEN sizeof(CSCOPE_RE_SPACE) - 1 1230 CHAR2INT(sp, CSCOPE_RE_SPACE, CSCOPE_LEN, wp, wlen); 1231 for (nspaces = 0, p = *ptrnp, len = *plenp; len > 0; ++p, --len) 1232 if (*p == ' ') 1233 ++nspaces; 1234 1235 /* 1236 * Allocate plenty of space: 1237 * the string, plus potential escaping characters; 1238 * nspaces + 2 copies of CSCOPE_RE_SPACE; 1239 * ^, $, nul terminator characters. 1240 */ 1241 *replacedp = 1; 1242 len = (p - *ptrnp) * 2 + (nspaces + 2) * sizeof(CSCOPE_RE_SPACE) + 3; 1243 GET_SPACE_RETW(sp, bp, blen, len); 1244 1245 p = *ptrnp; 1246 t = bp; 1247 1248 *t++ = '^'; 1249 MEMCPY(t, wp, wlen); 1250 t += wlen; 1251 1252 for (len = *plenp; len > 0; ++p, --len) 1253 if (*p == ' ') { 1254 MEMCPY(t, wp, wlen); 1255 t += wlen; 1256 } else { 1257 if (STRCHR(L("\\^.[]$*+?()|{}"), *p)) 1258 *t++ = '\\'; 1259 *t++ = *p; 1260 } 1261 1262 MEMCPY(t, wp, wlen); 1263 t += wlen; 1264 *t++ = '$'; 1265 1266 *ptrnp = bp; 1267 *plenp = t - bp; 1268 return (0); 1269 } 1270 1271 /* 1272 * re_error -- 1273 * Report a regular expression error. 1274 * 1275 * PUBLIC: void re_error(SCR *, int, regex_t *); 1276 */ 1277 void 1278 re_error(SCR *sp, int errcode, regex_t *preg) 1279 { 1280 size_t s; 1281 char *oe; 1282 1283 s = regerror(errcode, preg, "", 0); 1284 MALLOC(sp, oe, s); 1285 if (oe != NULL) { 1286 (void)regerror(errcode, preg, oe, s); 1287 msgq(sp, M_ERR, "RE error: %s", oe); 1288 free(oe); 1289 } 1290 } 1291 1292 /* 1293 * re_sub -- 1294 * Do the substitution for a regular expression. 1295 */ 1296 static int 1297 re_sub( 1298 SCR *sp, 1299 CHAR_T *ip, /* Input line. */ 1300 CHAR_T **lbp, 1301 size_t *lbclenp, 1302 size_t *lblenp, 1303 regmatch_t match[10]) 1304 { 1305 enum { C_NOTSET, C_LOWER, C_ONELOWER, C_ONEUPPER, C_UPPER } conv; 1306 size_t lbclen, lblen; /* Local copies. */ 1307 size_t mlen; /* Match length. */ 1308 size_t rpl; /* Remaining replacement length. */ 1309 CHAR_T *rp; /* Replacement pointer. */ 1310 int ch; 1311 int no; /* Match replacement offset. */ 1312 CHAR_T *p, *t; /* Buffer pointers. */ 1313 CHAR_T *lb; /* Local copies. */ 1314 1315 lb = *lbp; /* Get local copies. */ 1316 lbclen = *lbclenp; 1317 lblen = *lblenp; 1318 1319 /* 1320 * QUOTING NOTE: 1321 * 1322 * There are some special sequences that vi provides in the 1323 * replacement patterns. 1324 * & string the RE matched (\& if nomagic set) 1325 * \# n-th regular subexpression 1326 * \E end \U, \L conversion 1327 * \e end \U, \L conversion 1328 * \l convert the next character to lower-case 1329 * \L convert to lower-case, until \E, \e, or end of replacement 1330 * \u convert the next character to upper-case 1331 * \U convert to upper-case, until \E, \e, or end of replacement 1332 * 1333 * Otherwise, since this is the lowest level of replacement, discard 1334 * all escaping characters. This (hopefully) matches historic practice. 1335 */ 1336 #define OUTCH(ch, nltrans) { \ 1337 ARG_CHAR_T __ch = (ch); \ 1338 e_key_t __value = KEY_VAL(sp, __ch); \ 1339 if (nltrans && (__value == K_CR || __value == K_NL)) { \ 1340 NEEDNEWLINE(sp); \ 1341 sp->newl[sp->newl_cnt++] = lbclen; \ 1342 } else if (conv != C_NOTSET) { \ 1343 switch (conv) { \ 1344 case C_ONELOWER: \ 1345 conv = C_NOTSET; \ 1346 /* FALLTHROUGH */ \ 1347 case C_LOWER: \ 1348 if (ISUPPER(__ch)) \ 1349 __ch = TOLOWER(__ch); \ 1350 break; \ 1351 case C_ONEUPPER: \ 1352 conv = C_NOTSET; \ 1353 /* FALLTHROUGH */ \ 1354 case C_UPPER: \ 1355 if (ISLOWER(__ch)) \ 1356 __ch = TOUPPER(__ch); \ 1357 break; \ 1358 default: \ 1359 abort(); \ 1360 } \ 1361 } \ 1362 NEEDSP(sp, 1, p); \ 1363 *p++ = __ch; \ 1364 ++lbclen; \ 1365 } 1366 conv = C_NOTSET; 1367 for (rp = sp->repl, rpl = sp->repl_len, p = lb + lbclen; rpl--;) { 1368 switch (ch = *rp++) { 1369 case '&': 1370 if (O_ISSET(sp, O_MAGIC)) { 1371 no = 0; 1372 goto subzero; 1373 } 1374 break; 1375 case '\\': 1376 if (rpl == 0) 1377 break; 1378 --rpl; 1379 switch (ch = *rp) { 1380 case '&': 1381 ++rp; 1382 if (!O_ISSET(sp, O_MAGIC)) { 1383 no = 0; 1384 goto subzero; 1385 } 1386 break; 1387 case '0': case '1': case '2': case '3': case '4': 1388 case '5': case '6': case '7': case '8': case '9': 1389 no = *rp++ - '0'; 1390 subzero: if (match[no].rm_so == -1 || 1391 match[no].rm_eo == -1) 1392 break; 1393 mlen = match[no].rm_eo - match[no].rm_so; 1394 for (t = ip + match[no].rm_so; mlen--; ++t) 1395 OUTCH(*t, 0); 1396 continue; 1397 case 'e': 1398 case 'E': 1399 ++rp; 1400 conv = C_NOTSET; 1401 continue; 1402 case 'l': 1403 ++rp; 1404 conv = C_ONELOWER; 1405 continue; 1406 case 'L': 1407 ++rp; 1408 conv = C_LOWER; 1409 continue; 1410 case 'u': 1411 ++rp; 1412 conv = C_ONEUPPER; 1413 continue; 1414 case 'U': 1415 ++rp; 1416 conv = C_UPPER; 1417 continue; 1418 case '\r': 1419 OUTCH(ch, 0); 1420 continue; 1421 default: 1422 ++rp; 1423 break; 1424 } 1425 } 1426 OUTCH(ch, 1); 1427 } 1428 1429 *lbp = lb; /* Update caller's information. */ 1430 *lbclenp = lbclen; 1431 *lblenp = lblen; 1432 return (0); 1433 } 1434