1 /*-
2 * Copyright (c) 1992, 1993, 1994
3 * The Regents of the University of California. All rights reserved.
4 * Copyright (c) 1992, 1993, 1994, 1995, 1996
5 * Keith Bostic. All rights reserved.
6 *
7 * See the LICENSE file for redistribution information.
8 */
9
10 #include "config.h"
11
12 #include <sys/types.h>
13 #include <sys/queue.h>
14 #include <sys/time.h>
15
16 #include <bitstring.h>
17 #include <ctype.h>
18 #include <errno.h>
19 #include <limits.h>
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <unistd.h>
24
25 #include "../common/common.h"
26 #include "../vi/vi.h"
27
28 #define SUB_FIRST 0x01 /* The 'r' flag isn't reasonable. */
29 #define SUB_MUSTSETR 0x02 /* The 'r' flag is required. */
30
31 static int re_conv(SCR *, CHAR_T **, size_t *, int *);
32 static int re_cscope_conv(SCR *, CHAR_T **, size_t *, int *);
33 static int re_sub(SCR *,
34 CHAR_T *, CHAR_T **, size_t *, size_t *, regmatch_t [10]);
35 static int re_tag_conv(SCR *, CHAR_T **, size_t *, int *);
36 static int s(SCR *, EXCMD *, CHAR_T *, regex_t *, u_int);
37
38 /*
39 * ex_s --
40 * [line [,line]] s[ubstitute] [[/;]pat[/;]/repl[/;] [cgr] [count] [#lp]]
41 *
42 * Substitute on lines matching a pattern.
43 *
44 * PUBLIC: int ex_s(SCR *, EXCMD *);
45 */
46 int
ex_s(SCR * sp,EXCMD * cmdp)47 ex_s(SCR *sp, EXCMD *cmdp)
48 {
49 regex_t *re;
50 size_t blen, len;
51 u_int flags;
52 int delim;
53 CHAR_T *bp, *p, *ptrn, *rep, *t;
54
55 /*
56 * Skip leading white space.
57 *
58 * !!!
59 * Historic vi allowed any non-alphanumeric to serve as the
60 * substitution command delimiter.
61 *
62 * !!!
63 * If the arguments are empty, it's the same as &, i.e. we
64 * repeat the last substitution.
65 */
66 if (cmdp->argc == 0)
67 goto subagain;
68 for (p = cmdp->argv[0]->bp,
69 len = cmdp->argv[0]->len; len > 0; --len, ++p) {
70 if (!cmdskip(*p))
71 break;
72 }
73 if (len == 0)
74 subagain: return (ex_subagain(sp, cmdp));
75
76 delim = *p++;
77 if (is09azAZ(delim) || delim == '\\')
78 return (s(sp, cmdp, p, &sp->subre_c, SUB_MUSTSETR));
79
80 /*
81 * !!!
82 * The full-blown substitute command reset the remembered
83 * state of the 'c' and 'g' suffices.
84 */
85 sp->c_suffix = sp->g_suffix = 0;
86
87 /*
88 * Get the pattern string, toss escaping characters.
89 *
90 * !!!
91 * Historic vi accepted any of the following forms:
92 *
93 * :s/abc/def/ change "abc" to "def"
94 * :s/abc/def change "abc" to "def"
95 * :s/abc/ delete "abc"
96 * :s/abc delete "abc"
97 *
98 * QUOTING NOTE:
99 *
100 * Only toss an escaping character if it escapes a delimiter.
101 * This means that "s/A/\\\\f" replaces "A" with "\\f". It
102 * would be nice to be more regular, i.e. for each layer of
103 * escaping a single escaping character is removed, but that's
104 * not how the historic vi worked.
105 */
106 for (ptrn = t = p;;) {
107 if (p[0] == '\0' || p[0] == delim) {
108 if (p[0] == delim)
109 ++p;
110 /*
111 * !!!
112 * Nul terminate the pattern string -- it's passed
113 * to regcomp which doesn't understand anything else.
114 */
115 *t = '\0';
116 break;
117 }
118 if (p[0] == '\\') {
119 if (p[1] == delim)
120 ++p;
121 else if (p[1] == '\\')
122 *t++ = *p++;
123 }
124 *t++ = *p++;
125 }
126
127 /*
128 * If the pattern string is empty, use the last RE (not just the
129 * last substitution RE).
130 */
131 if (*ptrn == '\0') {
132 if (sp->re == NULL) {
133 ex_emsg(sp, NULL, EXM_NOPREVRE);
134 return (1);
135 }
136
137 /* Re-compile the RE if necessary. */
138 if (!F_ISSET(sp, SC_RE_SEARCH) &&
139 re_compile(sp, sp->re, sp->re_len,
140 NULL, NULL, &sp->re_c, RE_C_SEARCH))
141 return (1);
142 flags = 0;
143 } else {
144 /*
145 * !!!
146 * Compile the RE. Historic practice is that substitutes set
147 * the search direction as well as both substitute and search
148 * RE's. We compile the RE twice, as we don't want to bother
149 * ref counting the pattern string and (opaque) structure.
150 */
151 if (re_compile(sp, ptrn, t - ptrn, &sp->re,
152 &sp->re_len, &sp->re_c, RE_C_SEARCH))
153 return (1);
154 if (re_compile(sp, ptrn, t - ptrn, &sp->subre,
155 &sp->subre_len, &sp->subre_c, RE_C_SUBST))
156 return (1);
157
158 flags = SUB_FIRST;
159 sp->searchdir = FORWARD;
160 }
161 re = &sp->re_c;
162
163 /*
164 * Get the replacement string.
165 *
166 * The special character & (\& if O_MAGIC not set) matches the
167 * entire RE. No handling of & is required here, it's done by
168 * re_sub().
169 *
170 * The special character ~ (\~ if O_MAGIC not set) inserts the
171 * previous replacement string into this replacement string.
172 * Count ~'s to figure out how much space we need. We could
173 * special case nonexistent last patterns or whether or not
174 * O_MAGIC is set, but it's probably not worth the effort.
175 *
176 * QUOTING NOTE:
177 *
178 * Only toss an escaping character if it escapes a delimiter or
179 * if O_MAGIC is set and it escapes a tilde.
180 *
181 * !!!
182 * If the entire replacement pattern is "%", then use the last
183 * replacement pattern. This semantic was added to vi in System
184 * V and then percolated elsewhere, presumably around the time
185 * that it was added to their version of ed(1).
186 */
187 if (p[0] == '\0' || p[0] == delim) {
188 if (p[0] == delim)
189 ++p;
190 free(sp->repl);
191 sp->repl = NULL;
192 sp->repl_len = 0;
193 } else if (p[0] == '%' && (p[1] == '\0' || p[1] == delim))
194 p += p[1] == delim ? 2 : 1;
195 else {
196 for (rep = p, len = 0;
197 p[0] != '\0' && p[0] != delim; ++p, ++len)
198 if (p[0] == '~')
199 len += sp->repl_len;
200 GET_SPACE_RETW(sp, bp, blen, len);
201 for (t = bp, len = 0, p = rep;;) {
202 if (p[0] == '\0' || p[0] == delim) {
203 if (p[0] == delim)
204 ++p;
205 break;
206 }
207 if (p[0] == '\\') {
208 if (p[1] == delim)
209 ++p;
210 else if (p[1] == '\\') {
211 *t++ = *p++;
212 ++len;
213 } else if (p[1] == '~') {
214 ++p;
215 if (!O_ISSET(sp, O_MAGIC))
216 goto tilde;
217 }
218 } else if (p[0] == '~' && O_ISSET(sp, O_MAGIC)) {
219 tilde: ++p;
220 MEMCPY(t, sp->repl, sp->repl_len);
221 t += sp->repl_len;
222 len += sp->repl_len;
223 continue;
224 }
225 *t++ = *p++;
226 ++len;
227 }
228 if ((sp->repl_len = len) != 0) {
229 free(sp->repl);
230 MALLOC(sp, sp->repl, len * sizeof(CHAR_T));
231 if (sp->repl == NULL) {
232 FREE_SPACEW(sp, bp, blen);
233 return (1);
234 }
235 MEMCPY(sp->repl, bp, len);
236 }
237 FREE_SPACEW(sp, bp, blen);
238 }
239 return (s(sp, cmdp, p, re, flags));
240 }
241
242 /*
243 * ex_subagain --
244 * [line [,line]] & [cgr] [count] [#lp]]
245 *
246 * Substitute using the last substitute RE and replacement pattern.
247 *
248 * PUBLIC: int ex_subagain(SCR *, EXCMD *);
249 */
250 int
ex_subagain(SCR * sp,EXCMD * cmdp)251 ex_subagain(SCR *sp, EXCMD *cmdp)
252 {
253 if (sp->subre == NULL) {
254 ex_emsg(sp, NULL, EXM_NOPREVRE);
255 return (1);
256 }
257 if (!F_ISSET(sp, SC_RE_SUBST) &&
258 re_compile(sp, sp->subre, sp->subre_len,
259 NULL, NULL, &sp->subre_c, RE_C_SUBST))
260 return (1);
261 return (s(sp,
262 cmdp, cmdp->argc ? cmdp->argv[0]->bp : NULL, &sp->subre_c, 0));
263 }
264
265 /*
266 * ex_subtilde --
267 * [line [,line]] ~ [cgr] [count] [#lp]]
268 *
269 * Substitute using the last RE and last substitute replacement pattern.
270 *
271 * PUBLIC: int ex_subtilde(SCR *, EXCMD *);
272 */
273 int
ex_subtilde(SCR * sp,EXCMD * cmdp)274 ex_subtilde(SCR *sp, EXCMD *cmdp)
275 {
276 if (sp->re == NULL) {
277 ex_emsg(sp, NULL, EXM_NOPREVRE);
278 return (1);
279 }
280 if (!F_ISSET(sp, SC_RE_SEARCH) && re_compile(sp, sp->re,
281 sp->re_len, NULL, NULL, &sp->re_c, RE_C_SEARCH))
282 return (1);
283 return (s(sp,
284 cmdp, cmdp->argc ? cmdp->argv[0]->bp : NULL, &sp->re_c, 0));
285 }
286
287 /*
288 * s --
289 * Do the substitution. This stuff is *really* tricky. There are lots of
290 * special cases, and general nastiness. Don't mess with it unless you're
291 * pretty confident.
292 *
293 * The nasty part of the substitution is what happens when the replacement
294 * string contains newlines. It's a bit tricky -- consider the information
295 * that has to be retained for "s/f\(o\)o/^M\1^M\1/". The solution here is
296 * to build a set of newline offsets which we use to break the line up later,
297 * when the replacement is done. Don't change it unless you're *damned*
298 * confident.
299 */
/*
 * Helper macros for building the substituted line.  They expand inside a
 * function that returns int and, except for NEEDNEWLINE, rely on the local
 * variables lb (build buffer), lbclen (characters used) and lblen
 * (characters allocated).  On allocation failure each macro returns 1
 * from the enclosing function.
 */

/*
 * NEEDNEWLINE --
 *	Make room for one more entry in the sp->newl newline-offset array,
 *	growing it 25 entries at a time.  If the array is lost, both its
 *	capacity and its entry count are zeroed so later users never index
 *	a NULL array.
 */
#define	NEEDNEWLINE(sp) do {						\
	if ((sp)->newl_len == (sp)->newl_cnt) {				\
		(sp)->newl_len += 25;					\
		REALLOC(sp, (sp)->newl, size_t *,			\
		    (sp)->newl_len * sizeof(size_t));			\
		if ((sp)->newl == NULL) {				\
			(sp)->newl_len = 0;				\
			(sp)->newl_cnt = 0;				\
			return (1);					\
		}							\
	}								\
} while (0)

/*
 * BUILD --
 *	Append len characters starting at l to the build buffer, growing
 *	the buffer first if necessary.
 */
#define	BUILD(sp, l, len) do {						\
	if (lbclen + (len) > lblen) {					\
		lblen = p2roundup(MAX(lbclen + (len), 256));		\
		REALLOC(sp, lb, CHAR_T *, lblen * sizeof(CHAR_T));	\
		if (lb == NULL) {					\
			lbclen = 0;					\
			return (1);					\
		}							\
	}								\
	MEMCPY(lb + lbclen, (l), (len));				\
	lbclen += (len);						\
} while (0)

/*
 * NEEDSP --
 *	Ensure the build buffer has room for len more characters.  If the
 *	buffer had to be reallocated, pnt is reset to the current end of the
 *	built text.
 */
#define	NEEDSP(sp, len, pnt) do {					\
	if (lbclen + (len) > lblen) {					\
		lblen = p2roundup(MAX(lbclen + (len), 256));		\
		REALLOC(sp, lb, CHAR_T *, lblen * sizeof(CHAR_T));	\
		if (lb == NULL) {					\
			lbclen = 0;					\
			return (1);					\
		}							\
		(pnt) = lb + lbclen;					\
	}								\
} while (0)
336
337 static int
s(SCR * sp,EXCMD * cmdp,CHAR_T * s,regex_t * re,u_int flags)338 s(SCR *sp, EXCMD *cmdp, CHAR_T *s, regex_t *re, u_int flags)
339 {
340 EVENT ev;
341 MARK from, to;
342 TEXTH tiq[] = {{ 0 }};
343 recno_t elno, lno, slno;
344 u_long ul;
345 regmatch_t match[10];
346 size_t blen, cnt, last, lbclen, lblen, len, llen;
347 size_t offset, saved_offset, scno;
348 int cflag, lflag, nflag, pflag, rflag;
349 int didsub, do_eol_match, eflags, empty_ok, eval;
350 int linechanged, matched, quit, rval;
351 CHAR_T *bp, *lb;
352 enum nresult nret;
353
354 NEEDFILE(sp, cmdp);
355
356 slno = sp->lno;
357 scno = sp->cno;
358
359 /*
360 * !!!
361 * Historically, the 'g' and 'c' suffices were always toggled as flags,
362 * so ":s/A/B/" was the same as ":s/A/B/ccgg". If O_EDCOMPATIBLE was
363 * not set, they were initialized to 0 for all substitute commands. If
364 * O_EDCOMPATIBLE was set, they were initialized to 0 only if the user
365 * specified substitute/replacement patterns (see ex_s()).
366 */
367 if (!O_ISSET(sp, O_EDCOMPATIBLE))
368 sp->c_suffix = sp->g_suffix = 0;
369
370 /*
371 * Historic vi permitted the '#', 'l' and 'p' options in vi mode, but
372 * it only displayed the last change. I'd disallow them, but they are
373 * useful in combination with the [v]global commands. In the current
374 * model the problem is combining them with the 'c' flag -- the screen
375 * would have to flip back and forth between the confirm screen and the
376 * ex print screen, which would be pretty awful. We do display all
377 * changes, though, for what that's worth.
378 *
379 * !!!
380 * Historic vi was fairly strict about the order of "options", the
381 * count, and "flags". I'm somewhat fuzzy on the difference between
382 * options and flags, anyway, so this is a simpler approach, and we
383 * just take it them in whatever order the user gives them. (The ex
384 * usage statement doesn't reflect this.)
385 */
386 cflag = lflag = nflag = pflag = rflag = 0;
387 if (s == NULL)
388 goto noargs;
389 for (lno = OOBLNO; *s != '\0'; ++s)
390 switch (*s) {
391 case ' ':
392 case '\t':
393 continue;
394 case '+':
395 ++cmdp->flagoff;
396 break;
397 case '-':
398 --cmdp->flagoff;
399 break;
400 case '0': case '1': case '2': case '3': case '4':
401 case '5': case '6': case '7': case '8': case '9':
402 if (lno != OOBLNO)
403 goto usage;
404 errno = 0;
405 nret = nget_uslong(&ul, s, &s, 10);
406 lno = ul;
407 if (*s == '\0') /* Loop increment correction. */
408 --s;
409 if (nret != NUM_OK) {
410 if (nret == NUM_OVER)
411 msgq(sp, M_ERR, "153|Count overflow");
412 else if (nret == NUM_UNDER)
413 msgq(sp, M_ERR, "154|Count underflow");
414 else
415 msgq(sp, M_SYSERR, NULL);
416 return (1);
417 }
418 /*
419 * In historic vi, the count was inclusive from the
420 * second address.
421 */
422 cmdp->addr1.lno = cmdp->addr2.lno;
423 cmdp->addr2.lno += lno - 1;
424 if (!db_exist(sp, cmdp->addr2.lno) &&
425 db_last(sp, &cmdp->addr2.lno))
426 return (1);
427 break;
428 case '#':
429 nflag = 1;
430 break;
431 case 'c':
432 sp->c_suffix = !sp->c_suffix;
433
434 /* Ex text structure initialization. */
435 if (F_ISSET(sp, SC_EX))
436 TAILQ_INIT(tiq);
437 break;
438 case 'g':
439 sp->g_suffix = !sp->g_suffix;
440 break;
441 case 'l':
442 lflag = 1;
443 break;
444 case 'p':
445 pflag = 1;
446 break;
447 case 'r':
448 if (LF_ISSET(SUB_FIRST)) {
449 msgq(sp, M_ERR,
450 "155|Regular expression specified; r flag meaningless");
451 return (1);
452 }
453 if (!F_ISSET(sp, SC_RE_SEARCH)) {
454 ex_emsg(sp, NULL, EXM_NOPREVRE);
455 return (1);
456 }
457 rflag = 1;
458 re = &sp->re_c;
459 break;
460 default:
461 goto usage;
462 }
463
464 if (*s != '\0' || (!rflag && LF_ISSET(SUB_MUSTSETR))) {
465 usage: ex_emsg(sp, cmdp->cmd->usage, EXM_USAGE);
466 return (1);
467 }
468
469 noargs: if (F_ISSET(sp, SC_VI) && sp->c_suffix && (lflag || nflag || pflag)) {
470 msgq(sp, M_ERR,
471 "156|The #, l and p flags may not be combined with the c flag in vi mode");
472 return (1);
473 }
474
475 /*
476 * bp: if interactive, line cache
477 * blen: if interactive, line cache length
478 * lb: build buffer pointer.
479 * lbclen: current length of built buffer.
480 * lblen; length of build buffer.
481 */
482 bp = lb = NULL;
483 blen = lbclen = lblen = 0;
484
485 /* For each line... */
486 lno = cmdp->addr1.lno == 0 ? 1 : cmdp->addr1.lno;
487 for (matched = quit = 0,
488 elno = cmdp->addr2.lno; !quit && lno <= elno; ++lno) {
489
490 /* Someone's unhappy, time to stop. */
491 if (INTERRUPTED(sp))
492 break;
493
494 /* Get the line. */
495 if (db_get(sp, lno, DBG_FATAL, &s, &llen))
496 goto err;
497
498 /*
499 * Make a local copy if doing confirmation -- when calling
500 * the confirm routine we're likely to lose the cached copy.
501 */
502 if (sp->c_suffix) {
503 if (bp == NULL) {
504 GET_SPACE_RETW(sp, bp, blen, llen);
505 } else
506 ADD_SPACE_RETW(sp, bp, blen, llen);
507 MEMCPY(bp, s, llen);
508 s = bp;
509 }
510
511 /* Start searching from the beginning. */
512 offset = 0;
513 len = llen;
514
515 /* Reset the build buffer offset. */
516 lbclen = 0;
517
518 /* Reset empty match flag. */
519 empty_ok = 1;
520
521 /*
522 * We don't want to have to do a setline if the line didn't
523 * change -- keep track of whether or not this line changed.
524 * If doing confirmations, don't want to keep setting the
525 * line if change is refused -- keep track of substitutions.
526 */
527 didsub = linechanged = 0;
528
529 /* New line, do an EOL match. */
530 do_eol_match = 1;
531
532 /* It's not nul terminated, but we pretend it is. */
533 eflags = REG_STARTEND;
534
535 /*
536 * The search area is from s + offset to the EOL.
537 *
538 * Generally, match[0].rm_so is the offset of the start
539 * of the match from the start of the search, and offset
540 * is the offset of the start of the last search.
541 */
542 nextmatch: match[0].rm_so = 0;
543 match[0].rm_eo = len;
544
545 /* Get the next match. */
546 eval = regexec(re, s + offset, 10, match, eflags);
547
548 /*
549 * There wasn't a match or if there was an error, deal with
550 * it. If there was a previous match in this line, resolve
551 * the changes into the database. Otherwise, just move on.
552 */
553 if (eval == REG_NOMATCH)
554 goto endmatch;
555 if (eval != 0) {
556 re_error(sp, eval, re);
557 goto err;
558 }
559 matched = 1;
560
561 /* Only the first search can match an anchored expression. */
562 eflags |= REG_NOTBOL;
563
564 /*
565 * !!!
566 * It's possible to match 0-length strings -- for example, the
567 * command s;a*;X;, when matched against the string "aabb" will
568 * result in "XbXbX", i.e. the matches are "aa", the space
569 * between the b's and the space between the b's and the end of
570 * the string. There is a similar space between the beginning
571 * of the string and the a's. The rule that we use (because vi
572 * historically used it) is that any 0-length match, occurring
573 * immediately after a match, is ignored. Otherwise, the above
574 * example would have resulted in "XXbXbX". Another example is
575 * incorrectly using " *" to replace groups of spaces with one
576 * space.
577 *
578 * The way we do this is that if we just had a successful match,
579 * the starting offset does not skip characters, and the match
580 * is empty, ignore the match and move forward. If there's no
581 * more characters in the string, we were attempting to match
582 * after the last character, so quit.
583 */
584 if (!empty_ok && match[0].rm_so == 0 && match[0].rm_eo == 0) {
585 empty_ok = 1;
586 if (len == 0)
587 goto endmatch;
588 BUILD(sp, s + offset, 1);
589 ++offset;
590 --len;
591 goto nextmatch;
592 }
593
594 /* Confirm change. */
595 if (sp->c_suffix) {
596 /*
597 * Set the cursor position for confirmation. Note,
598 * if we matched on a '$', the cursor may be past
599 * the end of line.
600 */
601 from.lno = to.lno = lno;
602 from.cno = match[0].rm_so + offset;
603 to.cno = match[0].rm_eo + offset;
604 /*
605 * Both ex and vi have to correct for a change before
606 * the first character in the line.
607 */
608 if (llen == 0)
609 from.cno = to.cno = 0;
610 if (F_ISSET(sp, SC_VI)) {
611 /*
612 * Only vi has to correct for a change after
613 * the last character in the line.
614 *
615 * XXX
616 * It would be nice to change the vi code so
617 * that we could display a cursor past EOL.
618 */
619 if (to.cno >= llen)
620 to.cno = llen - 1;
621 if (from.cno >= llen)
622 from.cno = llen - 1;
623
624 sp->lno = from.lno;
625 sp->cno = from.cno;
626 if (vs_refresh(sp, 1))
627 goto err;
628
629 vs_update(sp, msg_cat(sp,
630 "169|Confirm change? [n]", NULL), NULL);
631
632 if (v_event_get(sp, &ev, 0, 0))
633 goto err;
634 switch (ev.e_event) {
635 case E_CHARACTER:
636 break;
637 case E_EOF:
638 case E_ERR:
639 case E_INTERRUPT:
640 goto lquit;
641 default:
642 v_event_err(sp, &ev);
643 goto lquit;
644 }
645 } else {
646 const int flags =
647 O_ISSET(sp, O_NUMBER) ? E_C_HASH : 0;
648 if (ex_print(sp, cmdp, &from, &to, flags) ||
649 ex_scprint(sp, &from, &to))
650 goto lquit;
651 if (ex_txt(sp, tiq, 0, TXT_CR))
652 goto err;
653 ev.e_c = TAILQ_FIRST(tiq)->lb[0];
654 }
655
656 switch (ev.e_c) {
657 case CH_YES:
658 break;
659 default:
660 case CH_NO:
661 didsub = 0;
662 BUILD(sp, s +offset, match[0].rm_eo);
663 goto skip;
664 case CH_QUIT:
665 /* Set the quit/interrupted flags. */
666 lquit: quit = 1;
667 F_SET(sp->gp, G_INTERRUPTED);
668
669 /*
670 * Resolve any changes, then return to (and
671 * exit from) the main loop.
672 */
673 goto endmatch;
674 }
675 }
676
677 /*
678 * Set the cursor to the last position changed, converting
679 * from 1-based to 0-based.
680 */
681 sp->lno = lno;
682 sp->cno = match[0].rm_so;
683
684 /* Copy the bytes before the match into the build buffer. */
685 BUILD(sp, s + offset, match[0].rm_so);
686
687 /* Substitute the matching bytes. */
688 didsub = 1;
689 if (re_sub(sp, s + offset, &lb, &lbclen, &lblen, match))
690 goto err;
691
692 /* Set the change flag so we know this line was modified. */
693 linechanged = 1;
694
695 /* Move past the matched bytes. */
696 skip: offset += match[0].rm_eo;
697 len -= match[0].rm_eo;
698
699 /* A match cannot be followed by an empty pattern. */
700 empty_ok = 0;
701
702 /*
703 * If doing a global change with confirmation, we have to
704 * update the screen. The basic idea is to store the line
705 * so the screen update routines can find it, and restart.
706 */
707 if (didsub && sp->c_suffix && sp->g_suffix) {
708 /*
709 * The new search offset will be the end of the
710 * modified line.
711 */
712 saved_offset = lbclen;
713
714 /* Copy the rest of the line. */
715 if (len)
716 BUILD(sp, s + offset, len);
717
718 /* Set the new offset. */
719 offset = saved_offset;
720
721 /* Store inserted lines, adjusting the build buffer. */
722 last = 0;
723 if (sp->newl_cnt) {
724 for (cnt = 0;
725 cnt < sp->newl_cnt; ++cnt, ++lno, ++elno) {
726 if (db_insert(sp, lno,
727 lb + last, sp->newl[cnt] - last))
728 goto err;
729 last = sp->newl[cnt] + 1;
730 ++sp->rptlines[L_ADDED];
731 }
732 lbclen -= last;
733 offset -= last;
734 sp->newl_cnt = 0;
735 }
736
737 /* Store and retrieve the line. */
738 if (db_set(sp, lno, lb + last, lbclen))
739 goto err;
740 if (db_get(sp, lno, DBG_FATAL, &s, &llen))
741 goto err;
742 ADD_SPACE_RETW(sp, bp, blen, llen);
743 MEMCPY(bp, s, llen);
744 s = bp;
745 len = llen - offset;
746
747 /* Restart the build. */
748 lbclen = 0;
749 BUILD(sp, s, offset);
750
751 /*
752 * If we haven't already done the after-the-string
753 * match, do one. Set REG_NOTEOL so the '$' pattern
754 * only matches once.
755 */
756 if (!do_eol_match)
757 goto endmatch;
758 if (offset == len) {
759 do_eol_match = 0;
760 eflags |= REG_NOTEOL;
761 }
762 goto nextmatch;
763 }
764
765 /*
766 * If it's a global:
767 *
768 * If at the end of the string, do a test for the after
769 * the string match. Set REG_NOTEOL so the '$' pattern
770 * only matches once.
771 */
772 if (sp->g_suffix && do_eol_match) {
773 if (len == 0) {
774 do_eol_match = 0;
775 eflags |= REG_NOTEOL;
776 }
777 goto nextmatch;
778 }
779
780 endmatch: if (!linechanged)
781 continue;
782
783 /* Copy any remaining bytes into the build buffer. */
784 if (len)
785 BUILD(sp, s + offset, len);
786
787 /* Store inserted lines, adjusting the build buffer. */
788 last = 0;
789 if (sp->newl_cnt) {
790 for (cnt = 0;
791 cnt < sp->newl_cnt; ++cnt, ++lno, ++elno) {
792 if (db_insert(sp,
793 lno, lb + last, sp->newl[cnt] - last))
794 goto err;
795 last = sp->newl[cnt] + 1;
796 ++sp->rptlines[L_ADDED];
797 }
798 lbclen -= last;
799 sp->newl_cnt = 0;
800 }
801
802 /* Store the changed line. */
803 if (db_set(sp, lno, lb + last, lbclen))
804 goto err;
805
806 /* Update changed line counter. */
807 if (sp->rptlchange != lno) {
808 sp->rptlchange = lno;
809 ++sp->rptlines[L_CHANGED];
810 }
811
812 /*
813 * !!!
814 * Display as necessary. Historic practice is to only
815 * display the last line of a line split into multiple
816 * lines.
817 */
818 if (lflag || nflag || pflag) {
819 from.lno = to.lno = lno;
820 from.cno = to.cno = 0;
821 if (lflag)
822 (void)ex_print(sp, cmdp, &from, &to, E_C_LIST);
823 if (nflag)
824 (void)ex_print(sp, cmdp, &from, &to, E_C_HASH);
825 if (pflag)
826 (void)ex_print(sp, cmdp, &from, &to, E_C_PRINT);
827 }
828 }
829
830 /*
831 * !!!
832 * Historically, vi attempted to leave the cursor at the same place if
833 * the substitution was done at the current cursor position. Otherwise
834 * it moved it to the first non-blank of the last line changed. There
835 * were some problems: for example, :s/$/foo/ with the cursor on the
836 * last character of the line left the cursor on the last character, or
837 * the & command with multiple occurrences of the matching string in the
838 * line usually left the cursor in a fairly random position.
839 *
840 * We try to do the same thing, with the exception that if the user is
841 * doing substitution with confirmation, we move to the last line about
842 * which the user was consulted, as opposed to the last line that they
843 * actually changed. This prevents a screen flash if the user doesn't
844 * change many of the possible lines.
845 */
846 if (!sp->c_suffix && (sp->lno != slno || sp->cno != scno)) {
847 sp->cno = 0;
848 (void)nonblank(sp, sp->lno, &sp->cno);
849 }
850
851 /*
852 * If not in a global command, and nothing matched, say so.
853 * Else, if none of the lines displayed, put something up.
854 */
855 rval = 0;
856 if (!matched) {
857 if (!F_ISSET(sp, SC_EX_GLOBAL)) {
858 msgq(sp, M_ERR, "157|No match found");
859 goto err;
860 }
861 } else if (!lflag && !nflag && !pflag)
862 F_SET(cmdp, E_AUTOPRINT);
863
864 if (0) {
865 err: rval = 1;
866 }
867
868 if (bp != NULL)
869 FREE_SPACEW(sp, bp, blen);
870 free(lb);
871 return (rval);
872 }
873
874 /*
875 * re_compile --
876 * Compile the RE.
877 *
878 * PUBLIC: int re_compile(SCR *,
879 * PUBLIC: CHAR_T *, size_t, CHAR_T **, size_t *, regex_t *, u_int);
880 */
881 int
re_compile(SCR * sp,CHAR_T * ptrn,size_t plen,CHAR_T ** ptrnp,size_t * lenp,regex_t * rep,u_int flags)882 re_compile(SCR *sp, CHAR_T *ptrn, size_t plen, CHAR_T **ptrnp, size_t *lenp, regex_t *rep, u_int flags)
883 {
884 size_t len;
885 int reflags, replaced, rval;
886 CHAR_T *p;
887
888 /* Set RE flags. */
889 reflags = 0;
890 if (!LF_ISSET(RE_C_CSCOPE | RE_C_TAG)) {
891 if (O_ISSET(sp, O_EXTENDED))
892 reflags |= REG_EXTENDED;
893 if (O_ISSET(sp, O_IGNORECASE))
894 reflags |= REG_ICASE;
895 if (O_ISSET(sp, O_ICLOWER)) {
896 for (p = ptrn, len = plen; len > 0; ++p, --len)
897 if (ISUPPER(*p))
898 break;
899 if (len == 0)
900 reflags |= REG_ICASE;
901 }
902 }
903
904 /* If we're replacing a saved value, clear the old one. */
905 if (LF_ISSET(RE_C_SEARCH) && F_ISSET(sp, SC_RE_SEARCH)) {
906 regfree(&sp->re_c);
907 F_CLR(sp, SC_RE_SEARCH);
908 }
909 if (LF_ISSET(RE_C_SUBST) && F_ISSET(sp, SC_RE_SUBST)) {
910 regfree(&sp->subre_c);
911 F_CLR(sp, SC_RE_SUBST);
912 }
913
914 /*
915 * If we're saving the string, it's a pattern we haven't seen before,
916 * so convert the vi-style RE's to POSIX 1003.2 RE's. Save a copy for
917 * later recompilation. Free any previously saved value.
918 */
919 if (ptrnp != NULL) {
920 replaced = 0;
921 if (LF_ISSET(RE_C_CSCOPE)) {
922 if (re_cscope_conv(sp, &ptrn, &plen, &replaced))
923 return (1);
924 /*
925 * XXX
926 * Currently, the match-any-<blank> expression used in
927 * re_cscope_conv() requires extended RE's. This may
928 * not be right or safe.
929 */
930 reflags |= REG_EXTENDED;
931 } else if (LF_ISSET(RE_C_TAG)) {
932 if (re_tag_conv(sp, &ptrn, &plen, &replaced))
933 return (1);
934 } else
935 if (re_conv(sp, &ptrn, &plen, &replaced))
936 return (1);
937
938 /* Discard previous pattern. */
939 free(*ptrnp);
940 *ptrnp = NULL;
941
942 if (lenp != NULL)
943 *lenp = plen;
944
945 /*
946 * Copy the string into allocated memory.
947 *
948 * XXX
949 * Regcomp isn't 8-bit clean, so the pattern is nul-terminated
950 * for now. There's just no other solution.
951 */
952 MALLOC(sp, *ptrnp, (plen + 1) * sizeof(CHAR_T));
953 if (*ptrnp != NULL) {
954 MEMCPY(*ptrnp, ptrn, plen);
955 (*ptrnp)[plen] = '\0';
956 }
957
958 /* Free up conversion-routine-allocated memory. */
959 if (replaced)
960 FREE_SPACEW(sp, ptrn, 0);
961
962 if (*ptrnp == NULL)
963 return (1);
964
965 ptrn = *ptrnp;
966 }
967
968 /*
969 * XXX
970 * Regcomp isn't 8-bit clean, so we just lost if the pattern
971 * contained a nul. Bummer!
972 */
973 if ((rval = regcomp(rep, ptrn, /* plen, */ reflags)) != 0) {
974 if (!LF_ISSET(RE_C_SILENT))
975 re_error(sp, rval, rep);
976 return (1);
977 }
978
979 if (LF_ISSET(RE_C_SEARCH))
980 F_SET(sp, SC_RE_SEARCH);
981 if (LF_ISSET(RE_C_SUBST))
982 F_SET(sp, SC_RE_SUBST);
983
984 return (0);
985 }
986
987 /*
988 * re_conv --
989 * Convert vi's regular expressions into something that the
990 * the POSIX 1003.2 RE functions can handle.
991 *
992 * There are three conversions we make to make vi's RE's (specifically
993 * the global, search, and substitute patterns) work with POSIX RE's.
994 *
995 * 1: If O_MAGIC is not set, strip backslashes from the magic character
996 * set (.[*~) that have them, and add them to the ones that don't.
997 * 2: If O_MAGIC is not set, the string "\~" is replaced with the text
998 * from the last substitute command's replacement string. If O_MAGIC
999 * is set, it's the string "~".
1000 * 3: The pattern \<ptrn\> does "word" searches, convert it to use the
1001 * new RE escapes.
1002 *
1003 * !!!/XXX
1004 * This doesn't exactly match the historic behavior of vi because we do
1005 * the ~ substitution before calling the RE engine, so magic characters
1006 * in the replacement string will be expanded by the RE engine, and they
1007 * weren't historically. It's a bug.
1008 */
1009 static int
re_conv(SCR * sp,CHAR_T ** ptrnp,size_t * plenp,int * replacedp)1010 re_conv(SCR *sp, CHAR_T **ptrnp, size_t *plenp, int *replacedp)
1011 {
1012 size_t blen, len, needlen;
1013 int magic;
1014 CHAR_T *bp, *p, *t;
1015
1016 /*
1017 * First pass through, we figure out how much space we'll need.
1018 * We do it in two passes, on the grounds that most of the time
1019 * the user is doing a search and won't have magic characters.
1020 * That way we can skip most of the memory allocation and copies.
1021 */
1022 magic = 0;
1023 for (p = *ptrnp, len = *plenp, needlen = 0; len > 0; ++p, --len)
1024 switch (*p) {
1025 case '\\':
1026 if (len > 1) {
1027 --len;
1028 switch (*++p) {
1029 case '<':
1030 magic = 1;
1031 needlen += RE_WSTART_LEN + 1;
1032 break;
1033 case '>':
1034 magic = 1;
1035 needlen += RE_WSTOP_LEN + 1;
1036 break;
1037 case '~':
1038 if (!O_ISSET(sp, O_MAGIC)) {
1039 magic = 1;
1040 needlen += sp->repl_len;
1041 }
1042 break;
1043 case '.':
1044 case '[':
1045 case '*':
1046 if (!O_ISSET(sp, O_MAGIC)) {
1047 magic = 1;
1048 needlen += 1;
1049 }
1050 break;
1051 default:
1052 needlen += 2;
1053 }
1054 } else
1055 needlen += 1;
1056 break;
1057 case '~':
1058 if (O_ISSET(sp, O_MAGIC)) {
1059 magic = 1;
1060 needlen += sp->repl_len;
1061 }
1062 break;
1063 case '.':
1064 case '[':
1065 case '*':
1066 if (!O_ISSET(sp, O_MAGIC)) {
1067 magic = 1;
1068 needlen += 2;
1069 }
1070 break;
1071 default:
1072 needlen += 1;
1073 break;
1074 }
1075
1076 if (!magic) {
1077 *replacedp = 0;
1078 return (0);
1079 }
1080
1081 /* Get enough memory to hold the final pattern. */
1082 *replacedp = 1;
1083 GET_SPACE_RETW(sp, bp, blen, needlen);
1084
1085 for (p = *ptrnp, len = *plenp, t = bp; len > 0; ++p, --len)
1086 switch (*p) {
1087 case '\\':
1088 if (len > 1) {
1089 --len;
1090 switch (*++p) {
1091 case '<':
1092 MEMCPY(t,
1093 RE_WSTART, RE_WSTART_LEN);
1094 t += RE_WSTART_LEN;
1095 break;
1096 case '>':
1097 MEMCPY(t,
1098 RE_WSTOP, RE_WSTOP_LEN);
1099 t += RE_WSTOP_LEN;
1100 break;
1101 case '~':
1102 if (O_ISSET(sp, O_MAGIC))
1103 *t++ = '~';
1104 else {
1105 MEMCPY(t,
1106 sp->repl, sp->repl_len);
1107 t += sp->repl_len;
1108 }
1109 break;
1110 case '.':
1111 case '[':
1112 case '*':
1113 if (O_ISSET(sp, O_MAGIC))
1114 *t++ = '\\';
1115 *t++ = *p;
1116 break;
1117 default:
1118 *t++ = '\\';
1119 *t++ = *p;
1120 }
1121 } else
1122 *t++ = '\\';
1123 break;
1124 case '~':
1125 if (O_ISSET(sp, O_MAGIC)) {
1126 MEMCPY(t, sp->repl, sp->repl_len);
1127 t += sp->repl_len;
1128 } else
1129 *t++ = '~';
1130 break;
1131 case '.':
1132 case '[':
1133 case '*':
1134 if (!O_ISSET(sp, O_MAGIC))
1135 *t++ = '\\';
1136 *t++ = *p;
1137 break;
1138 default:
1139 *t++ = *p;
1140 break;
1141 }
1142
1143 *ptrnp = bp;
1144 *plenp = t - bp;
1145 return (0);
1146 }
1147
1148 /*
1149 * re_tag_conv --
1150 * Convert a tags search path into something that the POSIX
1151 * 1003.2 RE functions can handle.
1152 */
1153 static int
re_tag_conv(SCR * sp,CHAR_T ** ptrnp,size_t * plenp,int * replacedp)1154 re_tag_conv(SCR *sp, CHAR_T **ptrnp, size_t *plenp, int *replacedp)
1155 {
1156 size_t blen, len;
1157 int lastdollar;
1158 CHAR_T *bp, *p, *t;
1159
1160 len = *plenp;
1161
1162 /* Max memory usage is 2 times the length of the string. */
1163 *replacedp = 1;
1164 GET_SPACE_RETW(sp, bp, blen, len * 2);
1165
1166 p = *ptrnp;
1167 t = bp;
1168
1169 /* If the last character is a '/' or '?', we just strip it. */
1170 if (len > 0 && (p[len - 1] == '/' || p[len - 1] == '?'))
1171 --len;
1172
1173 /* If the next-to-last or last character is a '$', it's magic. */
1174 if (len > 0 && p[len - 1] == '$') {
1175 --len;
1176 lastdollar = 1;
1177 } else
1178 lastdollar = 0;
1179
1180 /* If the first character is a '/' or '?', we just strip it. */
1181 if (len > 0 && (p[0] == '/' || p[0] == '?')) {
1182 ++p;
1183 --len;
1184 }
1185
1186 /* If the first or second character is a '^', it's magic. */
1187 if (p[0] == '^') {
1188 *t++ = *p++;
1189 --len;
1190 }
1191
1192 /*
1193 * Escape every other magic character we can find, meanwhile stripping
1194 * the backslashes ctags inserts when escaping the search delimiter
1195 * characters.
1196 */
1197 for (; len > 0; --len) {
1198 if (p[0] == '\\' && (p[1] == '/' || p[1] == '?')) {
1199 ++p;
1200 if (len > 1)
1201 --len;
1202 } else if (STRCHR(L("^.[]$*"), p[0]))
1203 *t++ = '\\';
1204 *t++ = *p++;
1205 }
1206 if (lastdollar)
1207 *t++ = '$';
1208
1209 *ptrnp = bp;
1210 *plenp = t - bp;
1211 return (0);
1212 }
1213
1214 /*
1215 * re_cscope_conv --
1216 * Convert a cscope search path into something that the POSIX
1217 * 1003.2 RE functions can handle.
1218 */
1219 static int
re_cscope_conv(SCR * sp,CHAR_T ** ptrnp,size_t * plenp,int * replacedp)1220 re_cscope_conv(SCR *sp, CHAR_T **ptrnp, size_t *plenp, int *replacedp)
1221 {
1222 size_t blen, len, nspaces;
1223 CHAR_T *bp, *t;
1224 CHAR_T *p;
1225 CHAR_T *wp;
1226 size_t wlen;
1227
1228 /*
1229 * Each space in the source line printed by cscope represents an
1230 * arbitrary sequence of spaces, tabs, and comments.
1231 */
1232 #define CSCOPE_RE_SPACE "([ \t]|/\\*([^*]|\\*/)*\\*/)*"
1233 #define CSCOPE_LEN sizeof(CSCOPE_RE_SPACE) - 1
1234 CHAR2INT(sp, CSCOPE_RE_SPACE, CSCOPE_LEN, wp, wlen);
1235 for (nspaces = 0, p = *ptrnp, len = *plenp; len > 0; ++p, --len)
1236 if (*p == ' ')
1237 ++nspaces;
1238
1239 /*
1240 * Allocate plenty of space:
1241 * the string, plus potential escaping characters;
1242 * nspaces + 2 copies of CSCOPE_RE_SPACE;
1243 * ^, $, nul terminator characters.
1244 */
1245 *replacedp = 1;
1246 len = (p - *ptrnp) * 2 + (nspaces + 2) * sizeof(CSCOPE_RE_SPACE) + 3;
1247 GET_SPACE_RETW(sp, bp, blen, len);
1248
1249 p = *ptrnp;
1250 t = bp;
1251
1252 *t++ = '^';
1253 MEMCPY(t, wp, wlen);
1254 t += wlen;
1255
1256 for (len = *plenp; len > 0; ++p, --len)
1257 if (*p == ' ') {
1258 MEMCPY(t, wp, wlen);
1259 t += wlen;
1260 } else {
1261 if (STRCHR(L("\\^.[]$*+?()|{}"), *p))
1262 *t++ = '\\';
1263 *t++ = *p;
1264 }
1265
1266 MEMCPY(t, wp, wlen);
1267 t += wlen;
1268 *t++ = '$';
1269
1270 *ptrnp = bp;
1271 *plenp = t - bp;
1272 return (0);
1273 }
1274
1275 /*
1276 * re_error --
1277 * Report a regular expression error.
1278 *
1279 * PUBLIC: void re_error(SCR *, int, regex_t *);
1280 */
1281 void
re_error(SCR * sp,int errcode,regex_t * preg)1282 re_error(SCR *sp, int errcode, regex_t *preg)
1283 {
1284 size_t s;
1285 char *oe;
1286
1287 s = regerror(errcode, preg, "", 0);
1288 MALLOC(sp, oe, s);
1289 if (oe != NULL) {
1290 (void)regerror(errcode, preg, oe, s);
1291 msgq(sp, M_ERR, "RE error: %s", oe);
1292 free(oe);
1293 }
1294 }
1295
1296 /*
1297 * re_sub --
1298 * Do the substitution for a regular expression.
1299 */
1300 static int
re_sub(SCR * sp,CHAR_T * ip,CHAR_T ** lbp,size_t * lbclenp,size_t * lblenp,regmatch_t match[10])1301 re_sub(
1302 SCR *sp,
1303 CHAR_T *ip, /* Input line. */
1304 CHAR_T **lbp,
1305 size_t *lbclenp,
1306 size_t *lblenp,
1307 regmatch_t match[10])
1308 {
1309 enum { C_NOTSET, C_LOWER, C_ONELOWER, C_ONEUPPER, C_UPPER } conv;
1310 size_t lbclen, lblen; /* Local copies. */
1311 size_t mlen; /* Match length. */
1312 size_t rpl; /* Remaining replacement length. */
1313 CHAR_T *rp; /* Replacement pointer. */
1314 int ch;
1315 int no; /* Match replacement offset. */
1316 CHAR_T *p, *t; /* Buffer pointers. */
1317 CHAR_T *lb; /* Local copies. */
1318
1319 lb = *lbp; /* Get local copies. */
1320 lbclen = *lbclenp;
1321 lblen = *lblenp;
1322
1323 /*
1324 * QUOTING NOTE:
1325 *
1326 * There are some special sequences that vi provides in the
1327 * replacement patterns.
1328 * & string the RE matched (\& if nomagic set)
1329 * \# n-th regular subexpression
1330 * \E end \U, \L conversion
1331 * \e end \U, \L conversion
1332 * \l convert the next character to lower-case
1333 * \L convert to lower-case, until \E, \e, or end of replacement
1334 * \u convert the next character to upper-case
1335 * \U convert to upper-case, until \E, \e, or end of replacement
1336 *
1337 * Otherwise, since this is the lowest level of replacement, discard
1338 * all escaping characters. This (hopefully) matches historic practice.
1339 */
1340 #define OUTCH(ch, nltrans) do { \
1341 ARG_CHAR_T __ch = (ch); \
1342 e_key_t __value = KEY_VAL(sp, __ch); \
1343 if (nltrans && (__value == K_CR || __value == K_NL)) { \
1344 NEEDNEWLINE(sp); \
1345 sp->newl[sp->newl_cnt++] = lbclen; \
1346 } else if (conv != C_NOTSET) { \
1347 switch (conv) { \
1348 case C_ONELOWER: \
1349 conv = C_NOTSET; \
1350 /* FALLTHROUGH */ \
1351 case C_LOWER: \
1352 if (ISUPPER(__ch)) \
1353 __ch = TOLOWER(__ch); \
1354 break; \
1355 case C_ONEUPPER: \
1356 conv = C_NOTSET; \
1357 /* FALLTHROUGH */ \
1358 case C_UPPER: \
1359 if (ISLOWER(__ch)) \
1360 __ch = TOUPPER(__ch); \
1361 break; \
1362 default: \
1363 abort(); \
1364 } \
1365 } \
1366 NEEDSP(sp, 1, p); \
1367 *p++ = __ch; \
1368 ++lbclen; \
1369 } while (0)
1370 conv = C_NOTSET;
1371 for (rp = sp->repl, rpl = sp->repl_len, p = lb + lbclen; rpl--;) {
1372 switch (ch = *rp++) {
1373 case '&':
1374 if (O_ISSET(sp, O_MAGIC)) {
1375 no = 0;
1376 goto subzero;
1377 }
1378 break;
1379 case '\\':
1380 if (rpl == 0)
1381 break;
1382 --rpl;
1383 switch (ch = *rp) {
1384 case '&':
1385 ++rp;
1386 if (!O_ISSET(sp, O_MAGIC)) {
1387 no = 0;
1388 goto subzero;
1389 }
1390 break;
1391 case '0': case '1': case '2': case '3': case '4':
1392 case '5': case '6': case '7': case '8': case '9':
1393 no = *rp++ - '0';
1394 subzero: if (match[no].rm_so == -1 ||
1395 match[no].rm_eo == -1)
1396 break;
1397 mlen = match[no].rm_eo - match[no].rm_so;
1398 for (t = ip + match[no].rm_so; mlen--; ++t)
1399 OUTCH(*t, 0);
1400 continue;
1401 case 'e':
1402 case 'E':
1403 ++rp;
1404 conv = C_NOTSET;
1405 continue;
1406 case 'l':
1407 ++rp;
1408 conv = C_ONELOWER;
1409 continue;
1410 case 'L':
1411 ++rp;
1412 conv = C_LOWER;
1413 continue;
1414 case 'u':
1415 ++rp;
1416 conv = C_ONEUPPER;
1417 continue;
1418 case 'U':
1419 ++rp;
1420 conv = C_UPPER;
1421 continue;
1422 case '\r':
1423 OUTCH(ch, 0);
1424 continue;
1425 default:
1426 ++rp;
1427 break;
1428 }
1429 }
1430 OUTCH(ch, 1);
1431 }
1432
1433 *lbp = lb; /* Update caller's information. */
1434 *lbclenp = lbclen;
1435 *lblenp = lblen;
1436 return (0);
1437 }
1438