1 /* Analyze file differences for GNU DIFF.
2
3 Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002,
4 2004 Free Software Foundation, Inc.
5
6 This file is part of GNU DIFF.
7
8 GNU DIFF is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 GNU DIFF is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING.
20 If not, write to the Free Software Foundation,
21 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
22
23 /* The basic algorithm is described in:
24 "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
25 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
26 see especially section 4.2, which describes the variation used below.
27 Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
28 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
29 at the price of producing suboptimal output for large inputs with
30 many differences.
31
32 The basic algorithm was independently discovered as described in:
33 "Algorithms for Approximate String Matching", E. Ukkonen,
34 Information and Control Vol. 64, 1985, pp. 100-118. */
35
36 #include "diff.h"
37 #include <cmpbuf.h>
38 #include <error.h>
39 #include <file-type.h>
40 #include <xalloc.h>
41
42 static lin *xvec, *yvec; /* Vectors being compared. */
43 static lin *fdiag; /* Vector, indexed by diagonal, containing
44 1 + the X coordinate of the point furthest
45 along the given diagonal in the forward
46 search of the edit matrix. */
47 static lin *bdiag; /* Vector, indexed by diagonal, containing
48 the X coordinate of the point furthest
49 along the given diagonal in the backward
50 search of the edit matrix. */
51 static lin too_expensive; /* Edit scripts longer than this are too
52 expensive to compute. */
53
54 #define SNAKE_LIMIT 20 /* Snakes bigger than this are considered `big'. */
55
56 struct partition
57 {
58 lin xmid, ymid; /* Midpoints of this partition. */
59 bool lo_minimal; /* Nonzero if low half will be analyzed minimally. */
60 bool hi_minimal; /* Likewise for high half. */
61 };
62
63 /* Find the midpoint of the shortest edit script for a specified
64 portion of the two files.
65
66 Scan from the beginnings of the files, and simultaneously from the ends,
67 doing a breadth-first search through the space of edit-sequence.
68 When the two searches meet, we have found the midpoint of the shortest
69 edit sequence.
70
71 If FIND_MINIMAL is nonzero, find the minimal edit script regardless
72 of expense. Otherwise, if the search is too expensive, use
73 heuristics to stop the search and report a suboptimal answer.
74
75 Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number
76 XMID - YMID equals the number of inserted lines minus the number
77 of deleted lines (counting only lines before the midpoint).
78
79 Set PART->lo_minimal to true iff the minimal edit script for the
80 left half of the partition is known; similarly for PART->hi_minimal.
81
82 This function assumes that the first lines of the specified portions
83 of the two files do not match, and likewise that the last lines do not
84 match. The caller must trim matching lines from the beginning and end
85 of the portions it is going to specify.
86
87 If we return the "wrong" partitions,
88 the worst this can do is cause suboptimal diff output.
89 It cannot cause incorrect diff output. */
90
91 static void
diag(lin xoff,lin xlim,lin yoff,lin ylim,bool find_minimal,struct partition * part)92 diag (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal,
93 struct partition *part)
94 {
95 lin *const fd = fdiag; /* Give the compiler a chance. */
96 lin *const bd = bdiag; /* Additional help for the compiler. */
97 lin const *const xv = xvec; /* Still more help for the compiler. */
98 lin const *const yv = yvec; /* And more and more . . . */
99 lin const dmin = xoff - ylim; /* Minimum valid diagonal. */
100 lin const dmax = xlim - yoff; /* Maximum valid diagonal. */
101 lin const fmid = xoff - yoff; /* Center diagonal of top-down search. */
102 lin const bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
103 lin fmin = fmid, fmax = fmid; /* Limits of top-down search. */
104 lin bmin = bmid, bmax = bmid; /* Limits of bottom-up search. */
105 lin c; /* Cost. */
106 bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
107 diagonal with respect to the northwest. */
108
109 fd[fmid] = xoff;
110 bd[bmid] = xlim;
111
112 for (c = 1;; ++c)
113 {
114 lin d; /* Active diagonal. */
115 bool big_snake = false;
116
117 /* Extend the top-down search by an edit step in each diagonal. */
118 fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
119 fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
120 for (d = fmax; d >= fmin; d -= 2)
121 {
122 lin x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];
123
124 if (tlo >= thi)
125 x = tlo + 1;
126 else
127 x = thi;
128 oldx = x;
129 y = x - d;
130 while (x < xlim && y < ylim && xv[x] == yv[y])
131 ++x, ++y;
132 if (x - oldx > SNAKE_LIMIT)
133 big_snake = true;
134 fd[d] = x;
135 if (odd && bmin <= d && d <= bmax && bd[d] <= x)
136 {
137 part->xmid = x;
138 part->ymid = y;
139 part->lo_minimal = part->hi_minimal = true;
140 return;
141 }
142 }
143
144 /* Similarly extend the bottom-up search. */
145 bmin > dmin ? bd[--bmin - 1] = LIN_MAX : ++bmin;
146 bmax < dmax ? bd[++bmax + 1] = LIN_MAX : --bmax;
147 for (d = bmax; d >= bmin; d -= 2)
148 {
149 lin x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];
150
151 if (tlo < thi)
152 x = tlo;
153 else
154 x = thi - 1;
155 oldx = x;
156 y = x - d;
157 while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
158 --x, --y;
159 if (oldx - x > SNAKE_LIMIT)
160 big_snake = true;
161 bd[d] = x;
162 if (!odd && fmin <= d && d <= fmax && x <= fd[d])
163 {
164 part->xmid = x;
165 part->ymid = y;
166 part->lo_minimal = part->hi_minimal = true;
167 return;
168 }
169 }
170
171 if (find_minimal)
172 continue;
173
174 /* Heuristic: check occasionally for a diagonal that has made
175 lots of progress compared with the edit distance.
176 If we have any such, find the one that has made the most
177 progress and return it as if it had succeeded.
178
179 With this heuristic, for files with a constant small density
180 of changes, the algorithm is linear in the file size. */
181
182 if (200 < c && big_snake && speed_large_files)
183 {
184 lin best = 0;
185
186 for (d = fmax; d >= fmin; d -= 2)
187 {
188 lin dd = d - fmid;
189 lin x = fd[d];
190 lin y = x - d;
191 lin v = (x - xoff) * 2 - dd;
192 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
193 {
194 if (v > best
195 && xoff + SNAKE_LIMIT <= x && x < xlim
196 && yoff + SNAKE_LIMIT <= y && y < ylim)
197 {
198 /* We have a good enough best diagonal;
199 now insist that it end with a significant snake. */
200 int k;
201
202 for (k = 1; xv[x - k] == yv[y - k]; k++)
203 if (k == SNAKE_LIMIT)
204 {
205 best = v;
206 part->xmid = x;
207 part->ymid = y;
208 break;
209 }
210 }
211 }
212 }
213 if (best > 0)
214 {
215 part->lo_minimal = true;
216 part->hi_minimal = false;
217 return;
218 }
219
220 best = 0;
221 for (d = bmax; d >= bmin; d -= 2)
222 {
223 lin dd = d - bmid;
224 lin x = bd[d];
225 lin y = x - d;
226 lin v = (xlim - x) * 2 + dd;
227 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
228 {
229 if (v > best
230 && xoff < x && x <= xlim - SNAKE_LIMIT
231 && yoff < y && y <= ylim - SNAKE_LIMIT)
232 {
233 /* We have a good enough best diagonal;
234 now insist that it end with a significant snake. */
235 int k;
236
237 for (k = 0; xv[x + k] == yv[y + k]; k++)
238 if (k == SNAKE_LIMIT - 1)
239 {
240 best = v;
241 part->xmid = x;
242 part->ymid = y;
243 break;
244 }
245 }
246 }
247 }
248 if (best > 0)
249 {
250 part->lo_minimal = false;
251 part->hi_minimal = true;
252 return;
253 }
254 }
255
256 /* Heuristic: if we've gone well beyond the call of duty,
257 give up and report halfway between our best results so far. */
258 if (c >= too_expensive)
259 {
260 lin fxybest, fxbest;
261 lin bxybest, bxbest;
262
263 fxbest = bxbest = 0; /* Pacify `gcc -Wall'. */
264
265 /* Find forward diagonal that maximizes X + Y. */
266 fxybest = -1;
267 for (d = fmax; d >= fmin; d -= 2)
268 {
269 lin x = MIN (fd[d], xlim);
270 lin y = x - d;
271 if (ylim < y)
272 x = ylim + d, y = ylim;
273 if (fxybest < x + y)
274 {
275 fxybest = x + y;
276 fxbest = x;
277 }
278 }
279
280 /* Find backward diagonal that minimizes X + Y. */
281 bxybest = LIN_MAX;
282 for (d = bmax; d >= bmin; d -= 2)
283 {
284 lin x = MAX (xoff, bd[d]);
285 lin y = x - d;
286 if (y < yoff)
287 x = yoff + d, y = yoff;
288 if (x + y < bxybest)
289 {
290 bxybest = x + y;
291 bxbest = x;
292 }
293 }
294
295 /* Use the better of the two diagonals. */
296 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
297 {
298 part->xmid = fxbest;
299 part->ymid = fxybest - fxbest;
300 part->lo_minimal = true;
301 part->hi_minimal = false;
302 }
303 else
304 {
305 part->xmid = bxbest;
306 part->ymid = bxybest - bxbest;
307 part->lo_minimal = false;
308 part->hi_minimal = true;
309 }
310 return;
311 }
312 }
313 }
314
315 /* Compare in detail contiguous subsequences of the two files
316 which are known, as a whole, to match each other.
317
318 The results are recorded in the vectors files[N].changed, by
319 storing 1 in the element for each line that is an insertion or deletion.
320
321 The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
322
323 Note that XLIM, YLIM are exclusive bounds.
324 All line numbers are origin-0 and discarded lines are not counted.
325
326 If FIND_MINIMAL, find a minimal difference no matter how
327 expensive it is. */
328
329 static void
compareseq(lin xoff,lin xlim,lin yoff,lin ylim,bool find_minimal)330 compareseq (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal)
331 {
332 lin const *xv = xvec; /* Help the compiler. */
333 lin const *yv = yvec;
334
335 /* Slide down the bottom initial diagonal. */
336 while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
337 ++xoff, ++yoff;
338 /* Slide up the top initial diagonal. */
339 while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
340 --xlim, --ylim;
341
342 /* Handle simple cases. */
343 if (xoff == xlim)
344 while (yoff < ylim)
345 files[1].changed[files[1].realindexes[yoff++]] = 1;
346 else if (yoff == ylim)
347 while (xoff < xlim)
348 files[0].changed[files[0].realindexes[xoff++]] = 1;
349 else
350 {
351 struct partition part;
352
353 /* Find a point of correspondence in the middle of the files. */
354 diag (xoff, xlim, yoff, ylim, find_minimal, &part);
355
356 /* Use the partitions to split this problem into subproblems. */
357 compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
358 compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
359 }
360 }
361
362 /* Discard lines from one file that have no matches in the other file.
363
364 A line which is discarded will not be considered by the actual
365 comparison algorithm; it will be as if that line were not in the file.
366 The file's `realindexes' table maps virtual line numbers
367 (which don't count the discarded lines) into real line numbers;
368 this is how the actual comparison algorithm produces results
369 that are comprehensible when the discarded lines are counted.
370
371 When we discard a line, we also mark it as a deletion or insertion
372 so that it will be printed in the output. */
373
374 static void
discard_confusing_lines(struct file_data filevec[])375 discard_confusing_lines (struct file_data filevec[])
376 {
377 int f;
378 lin i;
379 char *discarded[2];
380 lin *equiv_count[2];
381 lin *p;
382
383 /* Allocate our results. */
384 p = xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
385 * (2 * sizeof *p));
386 for (f = 0; f < 2; f++)
387 {
388 filevec[f].undiscarded = p; p += filevec[f].buffered_lines;
389 filevec[f].realindexes = p; p += filevec[f].buffered_lines;
390 }
391
392 /* Set up equiv_count[F][I] as the number of lines in file F
393 that fall in equivalence class I. */
394
395 p = zalloc (filevec[0].equiv_max * (2 * sizeof *p));
396 equiv_count[0] = p;
397 equiv_count[1] = p + filevec[0].equiv_max;
398
399 for (i = 0; i < filevec[0].buffered_lines; ++i)
400 ++equiv_count[0][filevec[0].equivs[i]];
401 for (i = 0; i < filevec[1].buffered_lines; ++i)
402 ++equiv_count[1][filevec[1].equivs[i]];
403
404 /* Set up tables of which lines are going to be discarded. */
405
406 discarded[0] = zalloc (filevec[0].buffered_lines
407 + filevec[1].buffered_lines);
408 discarded[1] = discarded[0] + filevec[0].buffered_lines;
409
410 /* Mark to be discarded each line that matches no line of the other file.
411 If a line matches many lines, mark it as provisionally discardable. */
412
413 for (f = 0; f < 2; f++)
414 {
415 size_t end = filevec[f].buffered_lines;
416 char *discards = discarded[f];
417 lin *counts = equiv_count[1 - f];
418 lin *equivs = filevec[f].equivs;
419 size_t many = 5;
420 size_t tem = end / 64;
421
422 /* Multiply MANY by approximate square root of number of lines.
423 That is the threshold for provisionally discardable lines. */
424 while ((tem = tem >> 2) > 0)
425 many *= 2;
426
427 for (i = 0; i < end; i++)
428 {
429 lin nmatch;
430 if (equivs[i] == 0)
431 continue;
432 nmatch = counts[equivs[i]];
433 if (nmatch == 0)
434 discards[i] = 1;
435 else if (nmatch > many)
436 discards[i] = 2;
437 }
438 }
439
440 /* Don't really discard the provisional lines except when they occur
441 in a run of discardables, with nonprovisionals at the beginning
442 and end. */
443
444 for (f = 0; f < 2; f++)
445 {
446 lin end = filevec[f].buffered_lines;
447 register char *discards = discarded[f];
448
449 for (i = 0; i < end; i++)
450 {
451 /* Cancel provisional discards not in middle of run of discards. */
452 if (discards[i] == 2)
453 discards[i] = 0;
454 else if (discards[i] != 0)
455 {
456 /* We have found a nonprovisional discard. */
457 register lin j;
458 lin length;
459 lin provisional = 0;
460
461 /* Find end of this run of discardable lines.
462 Count how many are provisionally discardable. */
463 for (j = i; j < end; j++)
464 {
465 if (discards[j] == 0)
466 break;
467 if (discards[j] == 2)
468 ++provisional;
469 }
470
471 /* Cancel provisional discards at end, and shrink the run. */
472 while (j > i && discards[j - 1] == 2)
473 discards[--j] = 0, --provisional;
474
475 /* Now we have the length of a run of discardable lines
476 whose first and last are not provisional. */
477 length = j - i;
478
479 /* If 1/4 of the lines in the run are provisional,
480 cancel discarding of all provisional lines in the run. */
481 if (provisional * 4 > length)
482 {
483 while (j > i)
484 if (discards[--j] == 2)
485 discards[j] = 0;
486 }
487 else
488 {
489 register lin consec;
490 lin minimum = 1;
491 lin tem = length >> 2;
492
493 /* MINIMUM is approximate square root of LENGTH/4.
494 A subrun of two or more provisionals can stand
495 when LENGTH is at least 16.
496 A subrun of 4 or more can stand when LENGTH >= 64. */
497 while (0 < (tem >>= 2))
498 minimum <<= 1;
499 minimum++;
500
501 /* Cancel any subrun of MINIMUM or more provisionals
502 within the larger run. */
503 for (j = 0, consec = 0; j < length; j++)
504 if (discards[i + j] != 2)
505 consec = 0;
506 else if (minimum == ++consec)
507 /* Back up to start of subrun, to cancel it all. */
508 j -= consec;
509 else if (minimum < consec)
510 discards[i + j] = 0;
511
512 /* Scan from beginning of run
513 until we find 3 or more nonprovisionals in a row
514 or until the first nonprovisional at least 8 lines in.
515 Until that point, cancel any provisionals. */
516 for (j = 0, consec = 0; j < length; j++)
517 {
518 if (j >= 8 && discards[i + j] == 1)
519 break;
520 if (discards[i + j] == 2)
521 consec = 0, discards[i + j] = 0;
522 else if (discards[i + j] == 0)
523 consec = 0;
524 else
525 consec++;
526 if (consec == 3)
527 break;
528 }
529
530 /* I advances to the last line of the run. */
531 i += length - 1;
532
533 /* Same thing, from end. */
534 for (j = 0, consec = 0; j < length; j++)
535 {
536 if (j >= 8 && discards[i - j] == 1)
537 break;
538 if (discards[i - j] == 2)
539 consec = 0, discards[i - j] = 0;
540 else if (discards[i - j] == 0)
541 consec = 0;
542 else
543 consec++;
544 if (consec == 3)
545 break;
546 }
547 }
548 }
549 }
550 }
551
552 /* Actually discard the lines. */
553 for (f = 0; f < 2; f++)
554 {
555 char *discards = discarded[f];
556 lin end = filevec[f].buffered_lines;
557 lin j = 0;
558 for (i = 0; i < end; ++i)
559 if (minimal || discards[i] == 0)
560 {
561 filevec[f].undiscarded[j] = filevec[f].equivs[i];
562 filevec[f].realindexes[j++] = i;
563 }
564 else
565 filevec[f].changed[i] = 1;
566 filevec[f].nondiscarded_lines = j;
567 }
568
569 free (discarded[0]);
570 free (equiv_count[0]);
571 }
572
573 /* Adjust inserts/deletes of identical lines to join changes
574 as much as possible.
575
576 We do something when a run of changed lines include a
577 line at one end and have an excluded, identical line at the other.
578 We are free to choose which identical line is included.
579 `compareseq' usually chooses the one at the beginning,
580 but usually it is cleaner to consider the following identical line
581 to be the "change". */
582
583 static void
shift_boundaries(struct file_data filevec[])584 shift_boundaries (struct file_data filevec[])
585 {
586 int f;
587
588 for (f = 0; f < 2; f++)
589 {
590 char *changed = filevec[f].changed;
591 char *other_changed = filevec[1 - f].changed;
592 lin const *equivs = filevec[f].equivs;
593 lin i = 0;
594 lin j = 0;
595 lin i_end = filevec[f].buffered_lines;
596
597 while (1)
598 {
599 lin runlength, start, corresponding;
600
601 /* Scan forwards to find beginning of another run of changes.
602 Also keep track of the corresponding point in the other file. */
603
604 while (i < i_end && !changed[i])
605 {
606 while (other_changed[j++])
607 continue;
608 i++;
609 }
610
611 if (i == i_end)
612 break;
613
614 start = i;
615
616 /* Find the end of this run of changes. */
617
618 while (changed[++i])
619 continue;
620 while (other_changed[j])
621 j++;
622
623 do
624 {
625 /* Record the length of this run of changes, so that
626 we can later determine whether the run has grown. */
627 runlength = i - start;
628
629 /* Move the changed region back, so long as the
630 previous unchanged line matches the last changed one.
631 This merges with previous changed regions. */
632
633 while (start && equivs[start - 1] == equivs[i - 1])
634 {
635 changed[--start] = 1;
636 changed[--i] = 0;
637 while (changed[start - 1])
638 start--;
639 while (other_changed[--j])
640 continue;
641 }
642
643 /* Set CORRESPONDING to the end of the changed run, at the last
644 point where it corresponds to a changed run in the other file.
645 CORRESPONDING == I_END means no such point has been found. */
646 corresponding = other_changed[j - 1] ? i : i_end;
647
648 /* Move the changed region forward, so long as the
649 first changed line matches the following unchanged one.
650 This merges with following changed regions.
651 Do this second, so that if there are no merges,
652 the changed region is moved forward as far as possible. */
653
654 while (i != i_end && equivs[start] == equivs[i])
655 {
656 changed[start++] = 0;
657 changed[i++] = 1;
658 while (changed[i])
659 i++;
660 while (other_changed[++j])
661 corresponding = i;
662 }
663 }
664 while (runlength != i - start);
665
666 /* If possible, move the fully-merged run of changes
667 back to a corresponding run in the other file. */
668
669 while (corresponding < i)
670 {
671 changed[--start] = 1;
672 changed[--i] = 0;
673 while (other_changed[--j])
674 continue;
675 }
676 }
677 }
678 }
679
680 /* Cons an additional entry onto the front of an edit script OLD.
681 LINE0 and LINE1 are the first affected lines in the two files (origin 0).
682 DELETED is the number of lines deleted here from file 0.
683 INSERTED is the number of lines inserted here in file 1.
684
685 If DELETED is 0 then LINE0 is the number of the line before
686 which the insertion was done; vice versa for INSERTED and LINE1. */
687
688 static struct change *
add_change(lin line0,lin line1,lin deleted,lin inserted,struct change * old)689 add_change (lin line0, lin line1, lin deleted, lin inserted,
690 struct change *old)
691 {
692 struct change *new = xmalloc (sizeof *new);
693
694 new->line0 = line0;
695 new->line1 = line1;
696 new->inserted = inserted;
697 new->deleted = deleted;
698 new->link = old;
699 return new;
700 }
701
702 /* Scan the tables of which lines are inserted and deleted,
703 producing an edit script in reverse order. */
704
705 static struct change *
build_reverse_script(struct file_data const filevec[])706 build_reverse_script (struct file_data const filevec[])
707 {
708 struct change *script = 0;
709 char *changed0 = filevec[0].changed;
710 char *changed1 = filevec[1].changed;
711 lin len0 = filevec[0].buffered_lines;
712 lin len1 = filevec[1].buffered_lines;
713
714 /* Note that changedN[len0] does exist, and is 0. */
715
716 lin i0 = 0, i1 = 0;
717
718 while (i0 < len0 || i1 < len1)
719 {
720 if (changed0[i0] | changed1[i1])
721 {
722 lin line0 = i0, line1 = i1;
723
724 /* Find # lines changed here in each file. */
725 while (changed0[i0]) ++i0;
726 while (changed1[i1]) ++i1;
727
728 /* Record this change. */
729 script = add_change (line0, line1, i0 - line0, i1 - line1, script);
730 }
731
732 /* We have reached lines in the two files that match each other. */
733 i0++, i1++;
734 }
735
736 return script;
737 }
738
739 /* Scan the tables of which lines are inserted and deleted,
740 producing an edit script in forward order. */
741
742 static struct change *
build_script(struct file_data const filevec[])743 build_script (struct file_data const filevec[])
744 {
745 struct change *script = 0;
746 char *changed0 = filevec[0].changed;
747 char *changed1 = filevec[1].changed;
748 lin i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
749
750 /* Note that changedN[-1] does exist, and is 0. */
751
752 while (i0 >= 0 || i1 >= 0)
753 {
754 if (changed0[i0 - 1] | changed1[i1 - 1])
755 {
756 lin line0 = i0, line1 = i1;
757
758 /* Find # lines changed here in each file. */
759 while (changed0[i0 - 1]) --i0;
760 while (changed1[i1 - 1]) --i1;
761
762 /* Record this change. */
763 script = add_change (i0, i1, line0 - i0, line1 - i1, script);
764 }
765
766 /* We have reached lines in the two files that match each other. */
767 i0--, i1--;
768 }
769
770 return script;
771 }
772
773 /* If CHANGES, briefly report that two files differed.
774 Return 2 if trouble, CHANGES otherwise. */
775 static int
briefly_report(int changes,struct file_data const filevec[])776 briefly_report (int changes, struct file_data const filevec[])
777 {
778 if (changes)
779 {
780 char const *label0 = file_label[0] ? file_label[0] : filevec[0].name;
781 char const *label1 = file_label[1] ? file_label[1] : filevec[1].name;
782 message ("Files %s and %s differ\n", label0, label1);
783 if (! brief)
784 changes = 2;
785 }
786
787 return changes;
788 }
789
790 /* Report the differences of two files. */
791 int
diff_2_files(struct comparison * cmp)792 diff_2_files (struct comparison *cmp)
793 {
794 lin diags;
795 int f;
796 struct change *e, *p;
797 struct change *script;
798 int changes;
799
800
801 /* If we have detected that either file is binary,
802 compare the two files as binary. This can happen
803 only when the first chunk is read.
804 Also, --brief without any --ignore-* options means
805 we can speed things up by treating the files as binary. */
806
807 if (read_files (cmp->file, files_can_be_treated_as_binary))
808 {
809 /* Files with different lengths must be different. */
810 if (cmp->file[0].stat.st_size != cmp->file[1].stat.st_size
811 && (cmp->file[0].desc < 0 || S_ISREG (cmp->file[0].stat.st_mode))
812 && (cmp->file[1].desc < 0 || S_ISREG (cmp->file[1].stat.st_mode)))
813 changes = 1;
814
815 /* Standard input equals itself. */
816 else if (cmp->file[0].desc == cmp->file[1].desc)
817 changes = 0;
818
819 else
820 /* Scan both files, a buffer at a time, looking for a difference. */
821 {
822 /* Allocate same-sized buffers for both files. */
823 size_t lcm_max = PTRDIFF_MAX - 1;
824 size_t buffer_size =
825 buffer_lcm (sizeof (word),
826 buffer_lcm (STAT_BLOCKSIZE (cmp->file[0].stat),
827 STAT_BLOCKSIZE (cmp->file[1].stat),
828 lcm_max),
829 lcm_max);
830 for (f = 0; f < 2; f++)
831 cmp->file[f].buffer = xrealloc (cmp->file[f].buffer, buffer_size);
832
833 for (;; cmp->file[0].buffered = cmp->file[1].buffered = 0)
834 {
835 /* Read a buffer's worth from both files. */
836 for (f = 0; f < 2; f++)
837 if (0 <= cmp->file[f].desc)
838 file_block_read (&cmp->file[f],
839 buffer_size - cmp->file[f].buffered);
840
841 /* If the buffers differ, the files differ. */
842 if (cmp->file[0].buffered != cmp->file[1].buffered
843 || memcmp (cmp->file[0].buffer,
844 cmp->file[1].buffer,
845 cmp->file[0].buffered))
846 {
847 changes = 1;
848 break;
849 }
850
851 /* If we reach end of file, the files are the same. */
852 if (cmp->file[0].buffered != buffer_size)
853 {
854 changes = 0;
855 break;
856 }
857 }
858 }
859
860 changes = briefly_report (changes, cmp->file);
861 }
862 else
863 {
864 /* Allocate vectors for the results of comparison:
865 a flag for each line of each file, saying whether that line
866 is an insertion or deletion.
867 Allocate an extra element, always 0, at each end of each vector. */
868
869 size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
870 char *flag_space = zalloc (s);
871 cmp->file[0].changed = flag_space + 1;
872 cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;
873
874 /* Some lines are obviously insertions or deletions
875 because they don't match anything. Detect them now, and
876 avoid even thinking about them in the main comparison algorithm. */
877
878 discard_confusing_lines (cmp->file);
879
880 /* Now do the main comparison algorithm, considering just the
881 undiscarded lines. */
882
883 xvec = cmp->file[0].undiscarded;
884 yvec = cmp->file[1].undiscarded;
885 diags = (cmp->file[0].nondiscarded_lines
886 + cmp->file[1].nondiscarded_lines + 3);
887 fdiag = xmalloc (diags * (2 * sizeof *fdiag));
888 bdiag = fdiag + diags;
889 fdiag += cmp->file[1].nondiscarded_lines + 1;
890 bdiag += cmp->file[1].nondiscarded_lines + 1;
891
892 /* Set TOO_EXPENSIVE to be approximate square root of input size,
893 bounded below by 256. */
894 too_expensive = 1;
895 for (; diags != 0; diags >>= 2)
896 too_expensive <<= 1;
897 too_expensive = MAX (256, too_expensive);
898
899 files[0] = cmp->file[0];
900 files[1] = cmp->file[1];
901
902 compareseq (0, cmp->file[0].nondiscarded_lines,
903 0, cmp->file[1].nondiscarded_lines, minimal);
904
905 free (fdiag - (cmp->file[1].nondiscarded_lines + 1));
906
907 /* Modify the results slightly to make them prettier
908 in cases where that can validly be done. */
909
910 shift_boundaries (cmp->file);
911
912 /* Get the results of comparison in the form of a chain
913 of `struct change's -- an edit script. */
914
915 if (output_style == OUTPUT_ED)
916 script = build_reverse_script (cmp->file);
917 else
918 script = build_script (cmp->file);
919
920 /* Set CHANGES if we had any diffs.
921 If some changes are ignored, we must scan the script to decide. */
922 if (ignore_blank_lines || ignore_regexp.fastmap)
923 {
924 struct change *next = script;
925 changes = 0;
926
927 while (next && changes == 0)
928 {
929 struct change *this, *end;
930 lin first0, last0, first1, last1;
931
932 /* Find a set of changes that belong together. */
933 this = next;
934 end = find_change (next);
935
936 /* Disconnect them from the rest of the changes, making them
937 a hunk, and remember the rest for next iteration. */
938 next = end->link;
939 end->link = 0;
940
941 /* Determine whether this hunk is really a difference. */
942 if (analyze_hunk (this, &first0, &last0, &first1, &last1))
943 changes = 1;
944
945 /* Reconnect the script so it will all be freed properly. */
946 end->link = next;
947 }
948 }
949 else
950 changes = (script != 0);
951
952 if (brief)
953 changes = briefly_report (changes, cmp->file);
954 else
955 {
956 if (changes | !no_diff_means_no_output)
957 {
958 /* Record info for starting up output,
959 to be used if and when we have some output to print. */
960 setup_output (file_label[0] ? file_label[0] : cmp->file[0].name,
961 file_label[1] ? file_label[1] : cmp->file[1].name,
962 cmp->parent != 0);
963
964 switch (output_style)
965 {
966 case OUTPUT_CONTEXT:
967 print_context_script (script, false);
968 break;
969
970 case OUTPUT_UNIFIED:
971 print_context_script (script, true);
972 break;
973
974 case OUTPUT_ED:
975 print_ed_script (script);
976 break;
977
978 case OUTPUT_FORWARD_ED:
979 pr_forward_ed_script (script);
980 break;
981
982 case OUTPUT_RCS:
983 print_rcs_script (script);
984 break;
985
986 case OUTPUT_NORMAL:
987 print_normal_script (script);
988 break;
989
990 case OUTPUT_IFDEF:
991 print_ifdef_script (script);
992 break;
993
994 case OUTPUT_SDIFF:
995 print_sdiff_script (script);
996 break;
997
998 default:
999 abort ();
1000 }
1001
1002 finish_output ();
1003 }
1004 }
1005
1006 free (cmp->file[0].undiscarded);
1007
1008 free (flag_space);
1009
1010 for (f = 0; f < 2; f++)
1011 {
1012 free (cmp->file[f].equivs);
1013 free (cmp->file[f].linbuf + cmp->file[f].linbuf_base);
1014 }
1015
1016 for (e = script; e; e = p)
1017 {
1018 p = e->link;
1019 free (e);
1020 }
1021
1022 if (! ROBUST_OUTPUT_STYLE (output_style))
1023 for (f = 0; f < 2; ++f)
1024 if (cmp->file[f].missing_newline)
1025 {
1026 error (0, 0, "%s: %s\n",
1027 file_label[f] ? file_label[f] : cmp->file[f].name,
1028 _("No newline at end of file"));
1029 changes = 2;
1030 }
1031 }
1032
1033 if (cmp->file[0].buffer != cmp->file[1].buffer)
1034 free (cmp->file[0].buffer);
1035 free (cmp->file[1].buffer);
1036
1037 return changes;
1038 }
1039