xref: /freebsd/contrib/diff/src/analyze.c (revision 6be3386466ab79a84b48429ae66244f21526d3df)
1 /* Analyze file differences for GNU DIFF.
2 
3    Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002,
4    2004 Free Software Foundation, Inc.
5 
6    This file is part of GNU DIFF.
7 
8    GNU DIFF is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 2, or (at your option)
11    any later version.
12 
13    GNU DIFF is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17 
18    You should have received a copy of the GNU General Public License
19    along with this program; see the file COPYING.
20    If not, write to the Free Software Foundation,
21    59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
22 
23 /* The basic algorithm is described in:
24    "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
25    Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
26    see especially section 4.2, which describes the variation used below.
27    Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
28    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
29    at the price of producing suboptimal output for large inputs with
30    many differences.
31 
32    The basic algorithm was independently discovered as described in:
33    "Algorithms for Approximate String Matching", E. Ukkonen,
34    Information and Control Vol. 64, 1985, pp. 100-118.  */
35 
36 #include "diff.h"
37 #include <cmpbuf.h>
38 #include <error.h>
39 #include <file-type.h>
40 #include <xalloc.h>
41 
42 static lin *xvec, *yvec;	/* Vectors being compared. */
43 static lin *fdiag;		/* Vector, indexed by diagonal, containing
44 				   1 + the X coordinate of the point furthest
45 				   along the given diagonal in the forward
46 				   search of the edit matrix. */
47 static lin *bdiag;		/* Vector, indexed by diagonal, containing
48 				   the X coordinate of the point furthest
49 				   along the given diagonal in the backward
50 				   search of the edit matrix. */
51 static lin too_expensive;	/* Edit scripts longer than this are too
52 				   expensive to compute.  */
53 
54 #define SNAKE_LIMIT 20	/* Snakes bigger than this are considered `big'.  */
55 
56 struct partition
57 {
58   lin xmid, ymid;	/* Midpoints of this partition.  */
59   bool lo_minimal;	/* Nonzero if low half will be analyzed minimally.  */
60   bool hi_minimal;	/* Likewise for high half.  */
61 };
62 
63 /* Find the midpoint of the shortest edit script for a specified
64    portion of the two files.
65 
66    Scan from the beginnings of the files, and simultaneously from the ends,
67    doing a breadth-first search through the space of edit-sequence.
68    When the two searches meet, we have found the midpoint of the shortest
69    edit sequence.
70 
71    If FIND_MINIMAL is nonzero, find the minimal edit script regardless
72    of expense.  Otherwise, if the search is too expensive, use
73    heuristics to stop the search and report a suboptimal answer.
74 
75    Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
76    XMID - YMID equals the number of inserted lines minus the number
77    of deleted lines (counting only lines before the midpoint).
78 
79    Set PART->lo_minimal to true iff the minimal edit script for the
80    left half of the partition is known; similarly for PART->hi_minimal.
81 
82    This function assumes that the first lines of the specified portions
83    of the two files do not match, and likewise that the last lines do not
84    match.  The caller must trim matching lines from the beginning and end
85    of the portions it is going to specify.
86 
87    If we return the "wrong" partitions,
88    the worst this can do is cause suboptimal diff output.
89    It cannot cause incorrect diff output.  */
90 
91 static void
92 diag (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal,
93       struct partition *part)
94 {
95   lin *const fd = fdiag;	/* Give the compiler a chance. */
96   lin *const bd = bdiag;	/* Additional help for the compiler. */
97   lin const *const xv = xvec;	/* Still more help for the compiler. */
98   lin const *const yv = yvec;	/* And more and more . . . */
99   lin const dmin = xoff - ylim;	/* Minimum valid diagonal. */
100   lin const dmax = xlim - yoff;	/* Maximum valid diagonal. */
101   lin const fmid = xoff - yoff;	/* Center diagonal of top-down search. */
102   lin const bmid = xlim - ylim;	/* Center diagonal of bottom-up search. */
103   lin fmin = fmid, fmax = fmid;	/* Limits of top-down search. */
104   lin bmin = bmid, bmax = bmid;	/* Limits of bottom-up search. */
105   lin c;			/* Cost. */
106   bool odd = (fmid - bmid) & 1;	/* True if southeast corner is on an odd
107 				   diagonal with respect to the northwest. */
108 
109   fd[fmid] = xoff;
110   bd[bmid] = xlim;
111 
112   for (c = 1;; ++c)
113     {
114       lin d;			/* Active diagonal. */
115       bool big_snake = false;
116 
117       /* Extend the top-down search by an edit step in each diagonal. */
118       fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
119       fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
120       for (d = fmax; d >= fmin; d -= 2)
121 	{
122 	  lin x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];
123 
124 	  if (tlo >= thi)
125 	    x = tlo + 1;
126 	  else
127 	    x = thi;
128 	  oldx = x;
129 	  y = x - d;
130 	  while (x < xlim && y < ylim && xv[x] == yv[y])
131 	    ++x, ++y;
132 	  if (x - oldx > SNAKE_LIMIT)
133 	    big_snake = true;
134 	  fd[d] = x;
135 	  if (odd && bmin <= d && d <= bmax && bd[d] <= x)
136 	    {
137 	      part->xmid = x;
138 	      part->ymid = y;
139 	      part->lo_minimal = part->hi_minimal = true;
140 	      return;
141 	    }
142 	}
143 
144       /* Similarly extend the bottom-up search.  */
145       bmin > dmin ? bd[--bmin - 1] = LIN_MAX : ++bmin;
146       bmax < dmax ? bd[++bmax + 1] = LIN_MAX : --bmax;
147       for (d = bmax; d >= bmin; d -= 2)
148 	{
149 	  lin x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];
150 
151 	  if (tlo < thi)
152 	    x = tlo;
153 	  else
154 	    x = thi - 1;
155 	  oldx = x;
156 	  y = x - d;
157 	  while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
158 	    --x, --y;
159 	  if (oldx - x > SNAKE_LIMIT)
160 	    big_snake = true;
161 	  bd[d] = x;
162 	  if (!odd && fmin <= d && d <= fmax && x <= fd[d])
163 	    {
164 	      part->xmid = x;
165 	      part->ymid = y;
166 	      part->lo_minimal = part->hi_minimal = true;
167 	      return;
168 	    }
169 	}
170 
171       if (find_minimal)
172 	continue;
173 
174       /* Heuristic: check occasionally for a diagonal that has made
175 	 lots of progress compared with the edit distance.
176 	 If we have any such, find the one that has made the most
177 	 progress and return it as if it had succeeded.
178 
179 	 With this heuristic, for files with a constant small density
180 	 of changes, the algorithm is linear in the file size.  */
181 
182       if (200 < c && big_snake && speed_large_files)
183 	{
184 	  lin best = 0;
185 
186 	  for (d = fmax; d >= fmin; d -= 2)
187 	    {
188 	      lin dd = d - fmid;
189 	      lin x = fd[d];
190 	      lin y = x - d;
191 	      lin v = (x - xoff) * 2 - dd;
192 	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
193 		{
194 		  if (v > best
195 		      && xoff + SNAKE_LIMIT <= x && x < xlim
196 		      && yoff + SNAKE_LIMIT <= y && y < ylim)
197 		    {
198 		      /* We have a good enough best diagonal;
199 			 now insist that it end with a significant snake.  */
200 		      int k;
201 
202 		      for (k = 1; xv[x - k] == yv[y - k]; k++)
203 			if (k == SNAKE_LIMIT)
204 			  {
205 			    best = v;
206 			    part->xmid = x;
207 			    part->ymid = y;
208 			    break;
209 			  }
210 		    }
211 		}
212 	    }
213 	  if (best > 0)
214 	    {
215 	      part->lo_minimal = true;
216 	      part->hi_minimal = false;
217 	      return;
218 	    }
219 
220 	  best = 0;
221 	  for (d = bmax; d >= bmin; d -= 2)
222 	    {
223 	      lin dd = d - bmid;
224 	      lin x = bd[d];
225 	      lin y = x - d;
226 	      lin v = (xlim - x) * 2 + dd;
227 	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
228 		{
229 		  if (v > best
230 		      && xoff < x && x <= xlim - SNAKE_LIMIT
231 		      && yoff < y && y <= ylim - SNAKE_LIMIT)
232 		    {
233 		      /* We have a good enough best diagonal;
234 			 now insist that it end with a significant snake.  */
235 		      int k;
236 
237 		      for (k = 0; xv[x + k] == yv[y + k]; k++)
238 			if (k == SNAKE_LIMIT - 1)
239 			  {
240 			    best = v;
241 			    part->xmid = x;
242 			    part->ymid = y;
243 			    break;
244 			  }
245 		    }
246 		}
247 	    }
248 	  if (best > 0)
249 	    {
250 	      part->lo_minimal = false;
251 	      part->hi_minimal = true;
252 	      return;
253 	    }
254 	}
255 
256       /* Heuristic: if we've gone well beyond the call of duty,
257 	 give up and report halfway between our best results so far.  */
258       if (c >= too_expensive)
259 	{
260 	  lin fxybest, fxbest;
261 	  lin bxybest, bxbest;
262 
263 	  fxbest = bxbest = 0;  /* Pacify `gcc -Wall'.  */
264 
265 	  /* Find forward diagonal that maximizes X + Y.  */
266 	  fxybest = -1;
267 	  for (d = fmax; d >= fmin; d -= 2)
268 	    {
269 	      lin x = MIN (fd[d], xlim);
270 	      lin y = x - d;
271 	      if (ylim < y)
272 		x = ylim + d, y = ylim;
273 	      if (fxybest < x + y)
274 		{
275 		  fxybest = x + y;
276 		  fxbest = x;
277 		}
278 	    }
279 
280 	  /* Find backward diagonal that minimizes X + Y.  */
281 	  bxybest = LIN_MAX;
282 	  for (d = bmax; d >= bmin; d -= 2)
283 	    {
284 	      lin x = MAX (xoff, bd[d]);
285 	      lin y = x - d;
286 	      if (y < yoff)
287 		x = yoff + d, y = yoff;
288 	      if (x + y < bxybest)
289 		{
290 		  bxybest = x + y;
291 		  bxbest = x;
292 		}
293 	    }
294 
295 	  /* Use the better of the two diagonals.  */
296 	  if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
297 	    {
298 	      part->xmid = fxbest;
299 	      part->ymid = fxybest - fxbest;
300 	      part->lo_minimal = true;
301 	      part->hi_minimal = false;
302 	    }
303 	  else
304 	    {
305 	      part->xmid = bxbest;
306 	      part->ymid = bxybest - bxbest;
307 	      part->lo_minimal = false;
308 	      part->hi_minimal = true;
309 	    }
310 	  return;
311 	}
312     }
313 }
314 
315 /* Compare in detail contiguous subsequences of the two files
316    which are known, as a whole, to match each other.
317 
318    The results are recorded in the vectors files[N].changed, by
319    storing 1 in the element for each line that is an insertion or deletion.
320 
321    The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
322 
323    Note that XLIM, YLIM are exclusive bounds.
324    All line numbers are origin-0 and discarded lines are not counted.
325 
326    If FIND_MINIMAL, find a minimal difference no matter how
327    expensive it is.  */
328 
329 static void
330 compareseq (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal)
331 {
332   lin const *xv = xvec; /* Help the compiler.  */
333   lin const *yv = yvec;
334 
335   /* Slide down the bottom initial diagonal. */
336   while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
337     ++xoff, ++yoff;
338   /* Slide up the top initial diagonal. */
339   while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
340     --xlim, --ylim;
341 
342   /* Handle simple cases. */
343   if (xoff == xlim)
344     while (yoff < ylim)
345       files[1].changed[files[1].realindexes[yoff++]] = 1;
346   else if (yoff == ylim)
347     while (xoff < xlim)
348       files[0].changed[files[0].realindexes[xoff++]] = 1;
349   else
350     {
351       struct partition part;
352 
353       /* Find a point of correspondence in the middle of the files.  */
354       diag (xoff, xlim, yoff, ylim, find_minimal, &part);
355 
356       /* Use the partitions to split this problem into subproblems.  */
357       compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
358       compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
359     }
360 }
361 
362 /* Discard lines from one file that have no matches in the other file.
363 
364    A line which is discarded will not be considered by the actual
365    comparison algorithm; it will be as if that line were not in the file.
366    The file's `realindexes' table maps virtual line numbers
367    (which don't count the discarded lines) into real line numbers;
368    this is how the actual comparison algorithm produces results
369    that are comprehensible when the discarded lines are counted.
370 
371    When we discard a line, we also mark it as a deletion or insertion
372    so that it will be printed in the output.  */
373 
374 static void
375 discard_confusing_lines (struct file_data filevec[])
376 {
377   int f;
378   lin i;
379   char *discarded[2];
380   lin *equiv_count[2];
381   lin *p;
382 
383   /* Allocate our results.  */
384   p = xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
385 	       * (2 * sizeof *p));
386   for (f = 0; f < 2; f++)
387     {
388       filevec[f].undiscarded = p;  p += filevec[f].buffered_lines;
389       filevec[f].realindexes = p;  p += filevec[f].buffered_lines;
390     }
391 
392   /* Set up equiv_count[F][I] as the number of lines in file F
393      that fall in equivalence class I.  */
394 
395   p = zalloc (filevec[0].equiv_max * (2 * sizeof *p));
396   equiv_count[0] = p;
397   equiv_count[1] = p + filevec[0].equiv_max;
398 
399   for (i = 0; i < filevec[0].buffered_lines; ++i)
400     ++equiv_count[0][filevec[0].equivs[i]];
401   for (i = 0; i < filevec[1].buffered_lines; ++i)
402     ++equiv_count[1][filevec[1].equivs[i]];
403 
404   /* Set up tables of which lines are going to be discarded.  */
405 
406   discarded[0] = zalloc (filevec[0].buffered_lines
407 			 + filevec[1].buffered_lines);
408   discarded[1] = discarded[0] + filevec[0].buffered_lines;
409 
410   /* Mark to be discarded each line that matches no line of the other file.
411      If a line matches many lines, mark it as provisionally discardable.  */
412 
413   for (f = 0; f < 2; f++)
414     {
415       size_t end = filevec[f].buffered_lines;
416       char *discards = discarded[f];
417       lin *counts = equiv_count[1 - f];
418       lin *equivs = filevec[f].equivs;
419       size_t many = 5;
420       size_t tem = end / 64;
421 
422       /* Multiply MANY by approximate square root of number of lines.
423 	 That is the threshold for provisionally discardable lines.  */
424       while ((tem = tem >> 2) > 0)
425 	many *= 2;
426 
427       for (i = 0; i < end; i++)
428 	{
429 	  lin nmatch;
430 	  if (equivs[i] == 0)
431 	    continue;
432 	  nmatch = counts[equivs[i]];
433 	  if (nmatch == 0)
434 	    discards[i] = 1;
435 	  else if (nmatch > many)
436 	    discards[i] = 2;
437 	}
438     }
439 
440   /* Don't really discard the provisional lines except when they occur
441      in a run of discardables, with nonprovisionals at the beginning
442      and end.  */
443 
444   for (f = 0; f < 2; f++)
445     {
446       lin end = filevec[f].buffered_lines;
447       register char *discards = discarded[f];
448 
449       for (i = 0; i < end; i++)
450 	{
451 	  /* Cancel provisional discards not in middle of run of discards.  */
452 	  if (discards[i] == 2)
453 	    discards[i] = 0;
454 	  else if (discards[i] != 0)
455 	    {
456 	      /* We have found a nonprovisional discard.  */
457 	      register lin j;
458 	      lin length;
459 	      lin provisional = 0;
460 
461 	      /* Find end of this run of discardable lines.
462 		 Count how many are provisionally discardable.  */
463 	      for (j = i; j < end; j++)
464 		{
465 		  if (discards[j] == 0)
466 		    break;
467 		  if (discards[j] == 2)
468 		    ++provisional;
469 		}
470 
471 	      /* Cancel provisional discards at end, and shrink the run.  */
472 	      while (j > i && discards[j - 1] == 2)
473 		discards[--j] = 0, --provisional;
474 
475 	      /* Now we have the length of a run of discardable lines
476 		 whose first and last are not provisional.  */
477 	      length = j - i;
478 
479 	      /* If 1/4 of the lines in the run are provisional,
480 		 cancel discarding of all provisional lines in the run.  */
481 	      if (provisional * 4 > length)
482 		{
483 		  while (j > i)
484 		    if (discards[--j] == 2)
485 		      discards[j] = 0;
486 		}
487 	      else
488 		{
489 		  register lin consec;
490 		  lin minimum = 1;
491 		  lin tem = length >> 2;
492 
493 		  /* MINIMUM is approximate square root of LENGTH/4.
494 		     A subrun of two or more provisionals can stand
495 		     when LENGTH is at least 16.
496 		     A subrun of 4 or more can stand when LENGTH >= 64.  */
497 		  while (0 < (tem >>= 2))
498 		    minimum <<= 1;
499 		  minimum++;
500 
501 		  /* Cancel any subrun of MINIMUM or more provisionals
502 		     within the larger run.  */
503 		  for (j = 0, consec = 0; j < length; j++)
504 		    if (discards[i + j] != 2)
505 		      consec = 0;
506 		    else if (minimum == ++consec)
507 		      /* Back up to start of subrun, to cancel it all.  */
508 		      j -= consec;
509 		    else if (minimum < consec)
510 		      discards[i + j] = 0;
511 
512 		  /* Scan from beginning of run
513 		     until we find 3 or more nonprovisionals in a row
514 		     or until the first nonprovisional at least 8 lines in.
515 		     Until that point, cancel any provisionals.  */
516 		  for (j = 0, consec = 0; j < length; j++)
517 		    {
518 		      if (j >= 8 && discards[i + j] == 1)
519 			break;
520 		      if (discards[i + j] == 2)
521 			consec = 0, discards[i + j] = 0;
522 		      else if (discards[i + j] == 0)
523 			consec = 0;
524 		      else
525 			consec++;
526 		      if (consec == 3)
527 			break;
528 		    }
529 
530 		  /* I advances to the last line of the run.  */
531 		  i += length - 1;
532 
533 		  /* Same thing, from end.  */
534 		  for (j = 0, consec = 0; j < length; j++)
535 		    {
536 		      if (j >= 8 && discards[i - j] == 1)
537 			break;
538 		      if (discards[i - j] == 2)
539 			consec = 0, discards[i - j] = 0;
540 		      else if (discards[i - j] == 0)
541 			consec = 0;
542 		      else
543 			consec++;
544 		      if (consec == 3)
545 			break;
546 		    }
547 		}
548 	    }
549 	}
550     }
551 
552   /* Actually discard the lines. */
553   for (f = 0; f < 2; f++)
554     {
555       char *discards = discarded[f];
556       lin end = filevec[f].buffered_lines;
557       lin j = 0;
558       for (i = 0; i < end; ++i)
559 	if (minimal || discards[i] == 0)
560 	  {
561 	    filevec[f].undiscarded[j] = filevec[f].equivs[i];
562 	    filevec[f].realindexes[j++] = i;
563 	  }
564 	else
565 	  filevec[f].changed[i] = 1;
566       filevec[f].nondiscarded_lines = j;
567     }
568 
569   free (discarded[0]);
570   free (equiv_count[0]);
571 }
572 
573 /* Adjust inserts/deletes of identical lines to join changes
574    as much as possible.
575 
576    We do something when a run of changed lines include a
577    line at one end and have an excluded, identical line at the other.
578    We are free to choose which identical line is included.
579    `compareseq' usually chooses the one at the beginning,
580    but usually it is cleaner to consider the following identical line
581    to be the "change".  */
582 
583 static void
584 shift_boundaries (struct file_data filevec[])
585 {
586   int f;
587 
588   for (f = 0; f < 2; f++)
589     {
590       char *changed = filevec[f].changed;
591       char *other_changed = filevec[1 - f].changed;
592       lin const *equivs = filevec[f].equivs;
593       lin i = 0;
594       lin j = 0;
595       lin i_end = filevec[f].buffered_lines;
596 
597       while (1)
598 	{
599 	  lin runlength, start, corresponding;
600 
601 	  /* Scan forwards to find beginning of another run of changes.
602 	     Also keep track of the corresponding point in the other file.  */
603 
604 	  while (i < i_end && !changed[i])
605 	    {
606 	      while (other_changed[j++])
607 		continue;
608 	      i++;
609 	    }
610 
611 	  if (i == i_end)
612 	    break;
613 
614 	  start = i;
615 
616 	  /* Find the end of this run of changes.  */
617 
618 	  while (changed[++i])
619 	    continue;
620 	  while (other_changed[j])
621 	    j++;
622 
623 	  do
624 	    {
625 	      /* Record the length of this run of changes, so that
626 		 we can later determine whether the run has grown.  */
627 	      runlength = i - start;
628 
629 	      /* Move the changed region back, so long as the
630 		 previous unchanged line matches the last changed one.
631 		 This merges with previous changed regions.  */
632 
633 	      while (start && equivs[start - 1] == equivs[i - 1])
634 		{
635 		  changed[--start] = 1;
636 		  changed[--i] = 0;
637 		  while (changed[start - 1])
638 		    start--;
639 		  while (other_changed[--j])
640 		    continue;
641 		}
642 
643 	      /* Set CORRESPONDING to the end of the changed run, at the last
644 		 point where it corresponds to a changed run in the other file.
645 		 CORRESPONDING == I_END means no such point has been found.  */
646 	      corresponding = other_changed[j - 1] ? i : i_end;
647 
648 	      /* Move the changed region forward, so long as the
649 		 first changed line matches the following unchanged one.
650 		 This merges with following changed regions.
651 		 Do this second, so that if there are no merges,
652 		 the changed region is moved forward as far as possible.  */
653 
654 	      while (i != i_end && equivs[start] == equivs[i])
655 		{
656 		  changed[start++] = 0;
657 		  changed[i++] = 1;
658 		  while (changed[i])
659 		    i++;
660 		  while (other_changed[++j])
661 		    corresponding = i;
662 		}
663 	    }
664 	  while (runlength != i - start);
665 
666 	  /* If possible, move the fully-merged run of changes
667 	     back to a corresponding run in the other file.  */
668 
669 	  while (corresponding < i)
670 	    {
671 	      changed[--start] = 1;
672 	      changed[--i] = 0;
673 	      while (other_changed[--j])
674 		continue;
675 	    }
676 	}
677     }
678 }
679 
680 /* Cons an additional entry onto the front of an edit script OLD.
681    LINE0 and LINE1 are the first affected lines in the two files (origin 0).
682    DELETED is the number of lines deleted here from file 0.
683    INSERTED is the number of lines inserted here in file 1.
684 
685    If DELETED is 0 then LINE0 is the number of the line before
686    which the insertion was done; vice versa for INSERTED and LINE1.  */
687 
688 static struct change *
689 add_change (lin line0, lin line1, lin deleted, lin inserted,
690 	    struct change *old)
691 {
692   struct change *new = xmalloc (sizeof *new);
693 
694   new->line0 = line0;
695   new->line1 = line1;
696   new->inserted = inserted;
697   new->deleted = deleted;
698   new->link = old;
699   return new;
700 }
701 
702 /* Scan the tables of which lines are inserted and deleted,
703    producing an edit script in reverse order.  */
704 
705 static struct change *
706 build_reverse_script (struct file_data const filevec[])
707 {
708   struct change *script = 0;
709   char *changed0 = filevec[0].changed;
710   char *changed1 = filevec[1].changed;
711   lin len0 = filevec[0].buffered_lines;
712   lin len1 = filevec[1].buffered_lines;
713 
714   /* Note that changedN[len0] does exist, and is 0.  */
715 
716   lin i0 = 0, i1 = 0;
717 
718   while (i0 < len0 || i1 < len1)
719     {
720       if (changed0[i0] | changed1[i1])
721 	{
722 	  lin line0 = i0, line1 = i1;
723 
724 	  /* Find # lines changed here in each file.  */
725 	  while (changed0[i0]) ++i0;
726 	  while (changed1[i1]) ++i1;
727 
728 	  /* Record this change.  */
729 	  script = add_change (line0, line1, i0 - line0, i1 - line1, script);
730 	}
731 
732       /* We have reached lines in the two files that match each other.  */
733       i0++, i1++;
734     }
735 
736   return script;
737 }
738 
739 /* Scan the tables of which lines are inserted and deleted,
740    producing an edit script in forward order.  */
741 
742 static struct change *
743 build_script (struct file_data const filevec[])
744 {
745   struct change *script = 0;
746   char *changed0 = filevec[0].changed;
747   char *changed1 = filevec[1].changed;
748   lin i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
749 
750   /* Note that changedN[-1] does exist, and is 0.  */
751 
752   while (i0 >= 0 || i1 >= 0)
753     {
754       if (changed0[i0 - 1] | changed1[i1 - 1])
755 	{
756 	  lin line0 = i0, line1 = i1;
757 
758 	  /* Find # lines changed here in each file.  */
759 	  while (changed0[i0 - 1]) --i0;
760 	  while (changed1[i1 - 1]) --i1;
761 
762 	  /* Record this change.  */
763 	  script = add_change (i0, i1, line0 - i0, line1 - i1, script);
764 	}
765 
766       /* We have reached lines in the two files that match each other.  */
767       i0--, i1--;
768     }
769 
770   return script;
771 }
772 
773 /* If CHANGES, briefly report that two files differed.
774    Return 2 if trouble, CHANGES otherwise.  */
775 static int
776 briefly_report (int changes, struct file_data const filevec[])
777 {
778   if (changes)
779     {
780       char const *label0 = file_label[0] ? file_label[0] : filevec[0].name;
781       char const *label1 = file_label[1] ? file_label[1] : filevec[1].name;
782       message ("Files %s and %s differ\n", label0, label1);
783       if (! brief)
784 	changes = 2;
785     }
786 
787   return changes;
788 }
789 
790 /* Report the differences of two files.  */
791 int
792 diff_2_files (struct comparison *cmp)
793 {
794   lin diags;
795   int f;
796   struct change *e, *p;
797   struct change *script;
798   int changes;
799 
800 
801   /* If we have detected that either file is binary,
802      compare the two files as binary.  This can happen
803      only when the first chunk is read.
804      Also, --brief without any --ignore-* options means
805      we can speed things up by treating the files as binary.  */
806 
807   if (read_files (cmp->file, files_can_be_treated_as_binary))
808     {
809       /* Files with different lengths must be different.  */
810       if (cmp->file[0].stat.st_size != cmp->file[1].stat.st_size
811 	  && (cmp->file[0].desc < 0 || S_ISREG (cmp->file[0].stat.st_mode))
812 	  && (cmp->file[1].desc < 0 || S_ISREG (cmp->file[1].stat.st_mode)))
813 	changes = 1;
814 
815       /* Standard input equals itself.  */
816       else if (cmp->file[0].desc == cmp->file[1].desc)
817 	changes = 0;
818 
819       else
820 	/* Scan both files, a buffer at a time, looking for a difference.  */
821 	{
822 	  /* Allocate same-sized buffers for both files.  */
823 	  size_t lcm_max = PTRDIFF_MAX - 1;
824 	  size_t buffer_size =
825 	    buffer_lcm (sizeof (word),
826 			buffer_lcm (STAT_BLOCKSIZE (cmp->file[0].stat),
827 				    STAT_BLOCKSIZE (cmp->file[1].stat),
828 				    lcm_max),
829 			lcm_max);
830 	  for (f = 0; f < 2; f++)
831 	    cmp->file[f].buffer = xrealloc (cmp->file[f].buffer, buffer_size);
832 
833 	  for (;; cmp->file[0].buffered = cmp->file[1].buffered = 0)
834 	    {
835 	      /* Read a buffer's worth from both files.  */
836 	      for (f = 0; f < 2; f++)
837 		if (0 <= cmp->file[f].desc)
838 		  file_block_read (&cmp->file[f],
839 				   buffer_size - cmp->file[f].buffered);
840 
841 	      /* If the buffers differ, the files differ.  */
842 	      if (cmp->file[0].buffered != cmp->file[1].buffered
843 		  || memcmp (cmp->file[0].buffer,
844 			     cmp->file[1].buffer,
845 			     cmp->file[0].buffered))
846 		{
847 		  changes = 1;
848 		  break;
849 		}
850 
851 	      /* If we reach end of file, the files are the same.  */
852 	      if (cmp->file[0].buffered != buffer_size)
853 		{
854 		  changes = 0;
855 		  break;
856 		}
857 	    }
858 	}
859 
860       changes = briefly_report (changes, cmp->file);
861     }
862   else
863     {
864       /* Allocate vectors for the results of comparison:
865 	 a flag for each line of each file, saying whether that line
866 	 is an insertion or deletion.
867 	 Allocate an extra element, always 0, at each end of each vector.  */
868 
869       size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
870       char *flag_space = zalloc (s);
871       cmp->file[0].changed = flag_space + 1;
872       cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;
873 
874       /* Some lines are obviously insertions or deletions
875 	 because they don't match anything.  Detect them now, and
876 	 avoid even thinking about them in the main comparison algorithm.  */
877 
878       discard_confusing_lines (cmp->file);
879 
880       /* Now do the main comparison algorithm, considering just the
881 	 undiscarded lines.  */
882 
883       xvec = cmp->file[0].undiscarded;
884       yvec = cmp->file[1].undiscarded;
885       diags = (cmp->file[0].nondiscarded_lines
886 	       + cmp->file[1].nondiscarded_lines + 3);
887       fdiag = xmalloc (diags * (2 * sizeof *fdiag));
888       bdiag = fdiag + diags;
889       fdiag += cmp->file[1].nondiscarded_lines + 1;
890       bdiag += cmp->file[1].nondiscarded_lines + 1;
891 
892       /* Set TOO_EXPENSIVE to be approximate square root of input size,
893 	 bounded below by 256.  */
894       too_expensive = 1;
895       for (;  diags != 0;  diags >>= 2)
896 	too_expensive <<= 1;
897       too_expensive = MAX (256, too_expensive);
898 
899       files[0] = cmp->file[0];
900       files[1] = cmp->file[1];
901 
902       compareseq (0, cmp->file[0].nondiscarded_lines,
903 		  0, cmp->file[1].nondiscarded_lines, minimal);
904 
905       free (fdiag - (cmp->file[1].nondiscarded_lines + 1));
906 
907       /* Modify the results slightly to make them prettier
908 	 in cases where that can validly be done.  */
909 
910       shift_boundaries (cmp->file);
911 
912       /* Get the results of comparison in the form of a chain
913 	 of `struct change's -- an edit script.  */
914 
915       if (output_style == OUTPUT_ED)
916 	script = build_reverse_script (cmp->file);
917       else
918 	script = build_script (cmp->file);
919 
920       /* Set CHANGES if we had any diffs.
921 	 If some changes are ignored, we must scan the script to decide.  */
922       if (ignore_blank_lines || ignore_regexp.fastmap)
923 	{
924 	  struct change *next = script;
925 	  changes = 0;
926 
927 	  while (next && changes == 0)
928 	    {
929 	      struct change *this, *end;
930 	      lin first0, last0, first1, last1;
931 
932 	      /* Find a set of changes that belong together.  */
933 	      this = next;
934 	      end = find_change (next);
935 
936 	      /* Disconnect them from the rest of the changes, making them
937 		 a hunk, and remember the rest for next iteration.  */
938 	      next = end->link;
939 	      end->link = 0;
940 
941 	      /* Determine whether this hunk is really a difference.  */
942 	      if (analyze_hunk (this, &first0, &last0, &first1, &last1))
943 		changes = 1;
944 
945 	      /* Reconnect the script so it will all be freed properly.  */
946 	      end->link = next;
947 	    }
948 	}
949       else
950 	changes = (script != 0);
951 
952       if (brief)
953 	changes = briefly_report (changes, cmp->file);
954       else
955 	{
956 	  if (changes | !no_diff_means_no_output)
957 	    {
958 	      /* Record info for starting up output,
959 		 to be used if and when we have some output to print.  */
960 	      setup_output (file_label[0] ? file_label[0] : cmp->file[0].name,
961 			    file_label[1] ? file_label[1] : cmp->file[1].name,
962 			    cmp->parent != 0);
963 
964 	      switch (output_style)
965 		{
966 		case OUTPUT_CONTEXT:
967 		  print_context_script (script, false);
968 		  break;
969 
970 		case OUTPUT_UNIFIED:
971 		  print_context_script (script, true);
972 		  break;
973 
974 		case OUTPUT_ED:
975 		  print_ed_script (script);
976 		  break;
977 
978 		case OUTPUT_FORWARD_ED:
979 		  pr_forward_ed_script (script);
980 		  break;
981 
982 		case OUTPUT_RCS:
983 		  print_rcs_script (script);
984 		  break;
985 
986 		case OUTPUT_NORMAL:
987 		  print_normal_script (script);
988 		  break;
989 
990 		case OUTPUT_IFDEF:
991 		  print_ifdef_script (script);
992 		  break;
993 
994 		case OUTPUT_SDIFF:
995 		  print_sdiff_script (script);
996 		  break;
997 
998 		default:
999 		  abort ();
1000 		}
1001 
1002 	      finish_output ();
1003 	    }
1004 	}
1005 
1006       free (cmp->file[0].undiscarded);
1007 
1008       free (flag_space);
1009 
1010       for (f = 0; f < 2; f++)
1011 	{
1012 	  free (cmp->file[f].equivs);
1013 	  free (cmp->file[f].linbuf + cmp->file[f].linbuf_base);
1014 	}
1015 
1016       for (e = script; e; e = p)
1017 	{
1018 	  p = e->link;
1019 	  free (e);
1020 	}
1021 
1022       if (! ROBUST_OUTPUT_STYLE (output_style))
1023 	for (f = 0; f < 2; ++f)
1024 	  if (cmp->file[f].missing_newline)
1025 	    {
1026 	      error (0, 0, "%s: %s\n",
1027 		     file_label[f] ? file_label[f] : cmp->file[f].name,
1028 		     _("No newline at end of file"));
1029 	      changes = 2;
1030 	    }
1031     }
1032 
1033   if (cmp->file[0].buffer != cmp->file[1].buffer)
1034     free (cmp->file[0].buffer);
1035   free (cmp->file[1].buffer);
1036 
1037   return changes;
1038 }
1039