xref: /freebsd/contrib/diff/src/io.c (revision b87e84c58ab6fa908883893553e6920b551bf1b7)
1  /* File I/O for GNU DIFF.
2  
3     Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002,
4     2004 Free Software Foundation, Inc.
5  
6     This file is part of GNU DIFF.
7  
8     GNU DIFF is free software; you can redistribute it and/or modify
9     it under the terms of the GNU General Public License as published by
10     the Free Software Foundation; either version 2, or (at your option)
11     any later version.
12  
13     GNU DIFF is distributed in the hope that it will be useful,
14     but WITHOUT ANY WARRANTY; without even the implied warranty of
15     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16     GNU General Public License for more details.
17  
18     You should have received a copy of the GNU General Public License
19     along with this program; see the file COPYING.
20     If not, write to the Free Software Foundation,
21     59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
22  
23  #include "diff.h"
24  #include <cmpbuf.h>
25  #include <file-type.h>
26  #include <setmode.h>
27  #include <xalloc.h>
28  
29  /* Rotate an unsigned value to the left.  */
30  #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
31  
32  /* Given a hash value and a new character, return a new hash value.  */
33  #define HASH(h, c) ((c) + ROL (h, 7))
34  
35  /* The type of a hash value.  */
36  typedef size_t hash_value;
37  verify (hash_value_is_unsigned, ! TYPE_SIGNED (hash_value));
38  
39  /* Lines are put into equivalence classes of lines that match in lines_differ.
40     Each equivalence class is represented by one of these structures,
41     but only while the classes are being computed.
42     Afterward, each class is represented by a number.  */
43  struct equivclass
44  {
45    lin next;		/* Next item in this bucket.  */
46    hash_value hash;	/* Hash of lines in this class.  */
47    char const *line;	/* A line that fits this class.  */
48    size_t length;	/* That line's length, not counting its newline.  */
49  };
50  
51  /* Hash-table: array of buckets, each being a chain of equivalence classes.
52     buckets[-1] is reserved for incomplete lines.  */
53  static lin *buckets;
54  
55  /* Number of buckets in the hash table array, not counting buckets[-1].  */
56  static size_t nbuckets;
57  
58  /* Array in which the equivalence classes are allocated.
59     The bucket-chains go through the elements in this array.
60     The number of an equivalence class is its index in this array.  */
61  static struct equivclass *equivs;
62  
63  /* Index of first free element in the array `equivs'.  */
64  static lin equivs_index;
65  
66  /* Number of elements allocated in the array `equivs'.  */
67  static lin equivs_alloc;
68  
69  /* Read a block of data into a file buffer, checking for EOF and error.  */
70  
71  void
72  file_block_read (struct file_data *current, size_t size)
73  {
74    if (size && ! current->eof)
75      {
76        size_t s = block_read (current->desc,
77  			     FILE_BUFFER (current) + current->buffered, size);
78        if (s == SIZE_MAX)
79  	pfatal_with_name (current->name);
80        current->buffered += s;
81        current->eof = s < size;
82      }
83  }
84  
85  /* Check for binary files and compare them for exact identity.  */
86  
87  /* Return 1 if BUF contains a non text character.
88     SIZE is the number of characters in BUF.  */
89  
90  #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
91  
92  /* Get ready to read the current file.
93     Return nonzero if SKIP_TEST is zero,
94     and if it appears to be a binary file.  */
95  
96  static bool
97  sip (struct file_data *current, bool skip_test)
98  {
99    /* If we have a nonexistent file at this stage, treat it as empty.  */
100    if (current->desc < 0)
101      {
102        /* Leave room for a sentinel.  */
103        current->bufsize = sizeof (word);
104        current->buffer = xmalloc (current->bufsize);
105      }
106    else
107      {
108        current->bufsize = buffer_lcm (sizeof (word),
109  				     STAT_BLOCKSIZE (current->stat),
110  				     PTRDIFF_MAX - 2 * sizeof (word));
111        current->buffer = xmalloc (current->bufsize);
112  
113        if (! skip_test)
114  	{
115  	  /* Check first part of file to see if it's a binary file.  */
116  
117  	  bool was_binary = set_binary_mode (current->desc, true);
118  	  off_t buffered;
119  	  file_block_read (current, current->bufsize);
120  	  buffered = current->buffered;
121  
122  	  if (! was_binary)
123  	    {
124  	      /* Revert to text mode and seek back to the beginning to
125  		 reread the file.  Use relative seek, since file
126  		 descriptors like stdin might not start at offset
127  		 zero.  */
128  
129  	      if (lseek (current->desc, - buffered, SEEK_CUR) == -1)
130  		pfatal_with_name (current->name);
131  	      set_binary_mode (current->desc, false);
132  	      current->buffered = 0;
133  	      current->eof = false;
134  	    }
135  
136  	  return binary_file_p (current->buffer, buffered);
137  	}
138      }
139  
140    current->buffered = 0;
141    current->eof = false;
142    return false;
143  }
144  
145  /* Slurp the rest of the current file completely into memory.  */
146  
147  static void
148  slurp (struct file_data *current)
149  {
150    size_t cc;
151  
152    if (current->desc < 0)
153      {
154        /* The file is nonexistent.  */
155        return;
156      }
157  
158    if (S_ISREG (current->stat.st_mode))
159      {
160        /* It's a regular file; slurp in the rest all at once.  */
161  
162        /* Get the size out of the stat block.
163  	 Allocate just enough room for appended newline plus word sentinel,
164  	 plus word-alignment since we want the buffer word-aligned.  */
165        size_t file_size = current->stat.st_size;
166        cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
167        if (file_size != current->stat.st_size || cc < file_size
168  	  || PTRDIFF_MAX <= cc)
169  	xalloc_die ();
170  
171        if (current->bufsize < cc)
172  	{
173  	  current->bufsize = cc;
174  	  current->buffer = xrealloc (current->buffer, cc);
175  	}
176  
177        /* Try to read at least 1 more byte than the size indicates, to
178  	 detect whether the file is growing.  This is a nicety for
179  	 users who run 'diff' on files while they are changing.  */
180  
181        if (current->buffered <= file_size)
182  	{
183  	  file_block_read (current, file_size + 1 - current->buffered);
184  	  if (current->buffered <= file_size)
185  	    return;
186  	}
187      }
188  
189    /* It's not a regular file, or it's a growing regular file; read it,
190       growing the buffer as needed.  */
191  
192    file_block_read (current, current->bufsize - current->buffered);
193  
194    if (current->buffered)
195      {
196        while (current->buffered == current->bufsize)
197  	{
198  	  if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
199  	    xalloc_die ();
200  	  current->bufsize *= 2;
201  	  current->buffer = xrealloc (current->buffer, current->bufsize);
202  	  file_block_read (current, current->bufsize - current->buffered);
203  	}
204  
205        /* Allocate just enough room for appended newline plus word
206  	 sentinel, plus word-alignment.  */
207        cc = current->buffered + 2 * sizeof (word);
208        current->bufsize = cc - cc % sizeof (word);
209        current->buffer = xrealloc (current->buffer, current->bufsize);
210      }
211  }
212  
213  /* Split the file into lines, simultaneously computing the equivalence
214     class for each line.  */
215  
216  static void
217  find_and_hash_each_line (struct file_data *current)
218  {
219    hash_value h;
220    char const *p = current->prefix_end;
221    unsigned char c;
222    lin i, *bucket;
223    size_t length;
224  
225    /* Cache often-used quantities in local variables to help the compiler.  */
226    char const **linbuf = current->linbuf;
227    lin alloc_lines = current->alloc_lines;
228    lin line = 0;
229    lin linbuf_base = current->linbuf_base;
230    lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
231    struct equivclass *eqs = equivs;
232    lin eqs_index = equivs_index;
233    lin eqs_alloc = equivs_alloc;
234    char const *suffix_begin = current->suffix_begin;
235    char const *bufend = FILE_BUFFER (current) + current->buffered;
236    bool diff_length_compare_anyway =
237      ignore_white_space != IGNORE_NO_WHITE_SPACE;
238    bool same_length_diff_contents_compare_anyway =
239      diff_length_compare_anyway | ignore_case;
240  
241    while (p < suffix_begin)
242      {
243        char const *ip = p;
244  
245        h = 0;
246  
247        /* Hash this line until we find a newline.  */
248        if (ignore_case)
249  	switch (ignore_white_space)
250  	  {
251  	  case IGNORE_ALL_SPACE:
252  	    while ((c = *p++) != '\n')
253  	      if (! isspace (c))
254  		h = HASH (h, tolower (c));
255  	    break;
256  
257  	  case IGNORE_SPACE_CHANGE:
258  	    while ((c = *p++) != '\n')
259  	      {
260  		if (isspace (c))
261  		  {
262  		    do
263  		      if ((c = *p++) == '\n')
264  			goto hashing_done;
265  		    while (isspace (c));
266  
267  		    h = HASH (h, ' ');
268  		  }
269  
270  		/* C is now the first non-space.  */
271  		h = HASH (h, tolower (c));
272  	      }
273  	    break;
274  
275  	  case IGNORE_TAB_EXPANSION:
276  	    {
277  	      size_t column = 0;
278  	      while ((c = *p++) != '\n')
279  		{
280  		  size_t repetitions = 1;
281  
282  		  switch (c)
283  		    {
284  		    case '\b':
285  		      column -= 0 < column;
286  		      break;
287  
288  		    case '\t':
289  		      c = ' ';
290  		      repetitions = tabsize - column % tabsize;
291  		      column = (column + repetitions < column
292  				? 0
293  				: column + repetitions);
294  		      break;
295  
296  		    case '\r':
297  		      column = 0;
298  		      break;
299  
300  		    default:
301  		      c = tolower (c);
302  		      column++;
303  		      break;
304  		    }
305  
306  		  do
307  		    h = HASH (h, c);
308  		  while (--repetitions != 0);
309  		}
310  	    }
311  	    break;
312  
313  	  default:
314  	    while ((c = *p++) != '\n')
315  	      h = HASH (h, tolower (c));
316  	    break;
317  	  }
318        else
319  	switch (ignore_white_space)
320  	  {
321  	  case IGNORE_ALL_SPACE:
322  	    while ((c = *p++) != '\n')
323  	      if (! isspace (c))
324  		h = HASH (h, c);
325  	    break;
326  
327  	  case IGNORE_SPACE_CHANGE:
328  	    while ((c = *p++) != '\n')
329  	      {
330  		if (isspace (c))
331  		  {
332  		    do
333  		      if ((c = *p++) == '\n')
334  			goto hashing_done;
335  		    while (isspace (c));
336  
337  		    h = HASH (h, ' ');
338  		  }
339  
340  		/* C is now the first non-space.  */
341  		h = HASH (h, c);
342  	      }
343  	    break;
344  
345  	  case IGNORE_TAB_EXPANSION:
346  	    {
347  	      size_t column = 0;
348  	      while ((c = *p++) != '\n')
349  		{
350  		  size_t repetitions = 1;
351  
352  		  switch (c)
353  		    {
354  		    case '\b':
355  		      column -= 0 < column;
356  		      break;
357  
358  		    case '\t':
359  		      c = ' ';
360  		      repetitions = tabsize - column % tabsize;
361  		      column = (column + repetitions < column
362  				? 0
363  				: column + repetitions);
364  		      break;
365  
366  		    case '\r':
367  		      column = 0;
368  		      break;
369  
370  		    default:
371  		      column++;
372  		      break;
373  		    }
374  
375  		  do
376  		    h = HASH (h, c);
377  		  while (--repetitions != 0);
378  		}
379  	    }
380  	    break;
381  
382  	  default:
383  	    while ((c = *p++) != '\n')
384  	      h = HASH (h, c);
385  	    break;
386  	  }
387  
388     hashing_done:;
389  
390        bucket = &buckets[h % nbuckets];
391        length = p - ip - 1;
392  
393        if (p == bufend
394  	  && current->missing_newline
395  	  && ROBUST_OUTPUT_STYLE (output_style))
396  	{
397  	  /* This line is incomplete.  If this is significant,
398  	     put the line into buckets[-1].  */
399  	  if (ignore_white_space < IGNORE_SPACE_CHANGE)
400  	    bucket = &buckets[-1];
401  
402  	  /* Omit the inserted newline when computing linbuf later.  */
403  	  p--;
404  	  bufend = suffix_begin = p;
405  	}
406  
407        for (i = *bucket;  ;  i = eqs[i].next)
408  	if (!i)
409  	  {
410  	    /* Create a new equivalence class in this bucket.  */
411  	    i = eqs_index++;
412  	    if (i == eqs_alloc)
413  	      {
414  		if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
415  		  xalloc_die ();
416  		eqs_alloc *= 2;
417  		eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
418  	      }
419  	    eqs[i].next = *bucket;
420  	    eqs[i].hash = h;
421  	    eqs[i].line = ip;
422  	    eqs[i].length = length;
423  	    *bucket = i;
424  	    break;
425  	  }
426  	else if (eqs[i].hash == h)
427  	  {
428  	    char const *eqline = eqs[i].line;
429  
430  	    /* Reuse existing class if lines_differ reports the lines
431                 equal.  */
432  	    if (eqs[i].length == length)
433  	      {
434  		/* Reuse existing equivalence class if the lines are identical.
435  		   This detects the common case of exact identity
436  		   faster than lines_differ would.  */
437  		if (memcmp (eqline, ip, length) == 0)
438  		  break;
439  		if (!same_length_diff_contents_compare_anyway)
440  		  continue;
441  	      }
442  	    else if (!diff_length_compare_anyway)
443  	      continue;
444  
445  	    if (! lines_differ (eqline, ip))
446  	      break;
447  	  }
448  
449        /* Maybe increase the size of the line table.  */
450        if (line == alloc_lines)
451  	{
452  	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
453  	  if (PTRDIFF_MAX / 3 <= alloc_lines
454  	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
455  	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
456  	    xalloc_die ();
457  	  alloc_lines = 2 * alloc_lines - linbuf_base;
458  	  cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
459  	  linbuf += linbuf_base;
460  	  linbuf = xrealloc (linbuf,
461  			     (alloc_lines - linbuf_base) * sizeof *linbuf);
462  	  linbuf -= linbuf_base;
463  	}
464        linbuf[line] = ip;
465        cureqs[line] = i;
466        ++line;
467      }
468  
469    current->buffered_lines = line;
470  
471    for (i = 0;  ;  i++)
472      {
473        /* Record the line start for lines in the suffix that we care about.
474  	 Record one more line start than lines,
475  	 so that we can compute the length of any buffered line.  */
476        if (line == alloc_lines)
477  	{
478  	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
479  	  if (PTRDIFF_MAX / 3 <= alloc_lines
480  	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
481  	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
482  	    xalloc_die ();
483  	  alloc_lines = 2 * alloc_lines - linbuf_base;
484  	  linbuf += linbuf_base;
485  	  linbuf = xrealloc (linbuf,
486  			     (alloc_lines - linbuf_base) * sizeof *linbuf);
487  	  linbuf -= linbuf_base;
488  	}
489        linbuf[line] = p;
490  
491        if (p == bufend)
492  	break;
493  
494        if (context <= i && no_diff_means_no_output)
495  	break;
496  
497        line++;
498  
499        while (*p++ != '\n')
500  	continue;
501      }
502  
503    /* Done with cache in local variables.  */
504    current->linbuf = linbuf;
505    current->valid_lines = line;
506    current->alloc_lines = alloc_lines;
507    current->equivs = cureqs;
508    equivs = eqs;
509    equivs_alloc = eqs_alloc;
510    equivs_index = eqs_index;
511  }
512  
513  /* Prepare the text.  Make sure the text end is initialized.
514     Make sure text ends in a newline,
515     but remember that we had to add one.
516     Strip trailing CRs, if that was requested.  */
517  
518  static void
519  prepare_text (struct file_data *current)
520  {
521    size_t buffered = current->buffered;
522    char *p = FILE_BUFFER (current);
523    char *dst;
524  
525    if (buffered == 0 || p[buffered - 1] == '\n')
526      current->missing_newline = false;
527    else
528      {
529        p[buffered++] = '\n';
530        current->missing_newline = true;
531      }
532  
533    if (!p)
534      return;
535  
536    /* Don't use uninitialized storage when planting or using sentinels.  */
537    memset (p + buffered, 0, sizeof (word));
538  
539    if (strip_trailing_cr && (dst = memchr (p, '\r', buffered)))
540      {
541        char const *src = dst;
542        char const *srclim = p + buffered;
543  
544        do
545  	dst += ! ((*dst = *src++) == '\r' && *src == '\n');
546        while (src < srclim);
547  
548        buffered -= src - dst;
549      }
550  
551    current->buffered = buffered;
552  }
553  
554  /* We have found N lines in a buffer of size S; guess the
555     proportionate number of lines that will be found in a buffer of
556     size T.  However, do not guess a number of lines so large that the
557     resulting line table might cause overflow in size calculations.  */
558  static lin
559  guess_lines (lin n, size_t s, size_t t)
560  {
561    size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
562    lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
563    return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
564  }
565  
566  /* Given a vector of two file_data objects, find the identical
567     prefixes and suffixes of each object.  */
568  
569  static void
570  find_identical_ends (struct file_data filevec[])
571  {
572    word *w0, *w1;
573    char *p0, *p1, *buffer0, *buffer1;
574    char const *end0, *beg0;
575    char const **linbuf0, **linbuf1;
576    lin i, lines;
577    size_t n0, n1;
578    lin alloc_lines0, alloc_lines1;
579    lin buffered_prefix, prefix_count, prefix_mask;
580    lin middle_guess, suffix_guess;
581  
582    slurp (&filevec[0]);
583    prepare_text (&filevec[0]);
584    if (filevec[0].desc != filevec[1].desc)
585      {
586        slurp (&filevec[1]);
587        prepare_text (&filevec[1]);
588      }
589    else
590      {
591        filevec[1].buffer = filevec[0].buffer;
592        filevec[1].bufsize = filevec[0].bufsize;
593        filevec[1].buffered = filevec[0].buffered;
594        filevec[1].missing_newline = filevec[0].missing_newline;
595      }
596  
597    /* Find identical prefix.  */
598  
599    w0 = filevec[0].buffer;
600    w1 = filevec[1].buffer;
601    p0 = buffer0 = (char *) w0;
602    p1 = buffer1 = (char *) w1;
603    n0 = filevec[0].buffered;
604    n1 = filevec[1].buffered;
605  
606    if (p0 == p1)
607      /* The buffers are the same; sentinels won't work.  */
608      p0 = p1 += n1;
609    else
610      {
611        /* Insert end sentinels, in this case characters that are guaranteed
612  	 to make the equality test false, and thus terminate the loop.  */
613  
614        if (n0 < n1)
615  	p0[n0] = ~p1[n0];
616        else
617  	p1[n1] = ~p0[n1];
618  
619        /* Loop until first mismatch, or to the sentinel characters.  */
620  
621        /* Compare a word at a time for speed.  */
622        while (*w0 == *w1)
623  	w0++, w1++;
624  
625        /* Do the last few bytes of comparison a byte at a time.  */
626        p0 = (char *) w0;
627        p1 = (char *) w1;
628        while (*p0 == *p1)
629  	p0++, p1++;
630  
631        /* Don't mistakenly count missing newline as part of prefix.  */
632        if (ROBUST_OUTPUT_STYLE (output_style)
633  	  && ((buffer0 + n0 - filevec[0].missing_newline < p0)
634  	      !=
635  	      (buffer1 + n1 - filevec[1].missing_newline < p1)))
636  	p0--, p1--;
637      }
638  
639    /* Now P0 and P1 point at the first nonmatching characters.  */
640  
641    /* Skip back to last line-beginning in the prefix,
642       and then discard up to HORIZON_LINES lines from the prefix.  */
643    i = horizon_lines;
644    while (p0 != buffer0 && (p0[-1] != '\n' || i--))
645      p0--, p1--;
646  
647    /* Record the prefix.  */
648    filevec[0].prefix_end = p0;
649    filevec[1].prefix_end = p1;
650  
651    /* Find identical suffix.  */
652  
653    /* P0 and P1 point beyond the last chars not yet compared.  */
654    p0 = buffer0 + n0;
655    p1 = buffer1 + n1;
656  
657    if (! ROBUST_OUTPUT_STYLE (output_style)
658        || filevec[0].missing_newline == filevec[1].missing_newline)
659      {
660        end0 = p0;	/* Addr of last char in file 0.  */
661  
662        /* Get value of P0 at which we should stop scanning backward:
663  	 this is when either P0 or P1 points just past the last char
664  	 of the identical prefix.  */
665        beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
666  
667        /* Scan back until chars don't match or we reach that point.  */
668        for (; p0 != beg0; p0--, p1--)
669  	if (*p0 != *p1)
670  	  {
671  	    /* Point at the first char of the matching suffix.  */
672  	    beg0 = p0;
673  	    break;
674  	  }
675  
676        /* Are we at a line-beginning in both files?  If not, add the rest of
677  	 this line to the main body.  Discard up to HORIZON_LINES lines from
678  	 the identical suffix.  Also, discard one extra line,
679  	 because shift_boundaries may need it.  */
680        i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
681  			    &&
682  			    (buffer1 == p1 || p1[-1] == '\n'));
683        while (i-- && p0 != end0)
684  	while (*p0++ != '\n')
685  	  continue;
686  
687        p1 += p0 - beg0;
688      }
689  
690    /* Record the suffix.  */
691    filevec[0].suffix_begin = p0;
692    filevec[1].suffix_begin = p1;
693  
694    /* Calculate number of lines of prefix to save.
695  
696       prefix_count == 0 means save the whole prefix;
697       we need this for options like -D that output the whole file,
698       or for enormous contexts (to avoid worrying about arithmetic overflow).
699       We also need it for options like -F that output some preceding line;
700       at least we will need to find the last few lines,
701       but since we don't know how many, it's easiest to find them all.
702  
703       Otherwise, prefix_count != 0.  Save just prefix_count lines at start
704       of the line buffer; they'll be moved to the proper location later.
705       Handle 1 more line than the context says (because we count 1 too many),
706       rounded up to the next power of 2 to speed index computation.  */
707  
708    if (no_diff_means_no_output && ! function_regexp.fastmap
709        && context < LIN_MAX / 4 && context < n0)
710      {
711        middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
712        suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
713        for (prefix_count = 1;  prefix_count <= context;  prefix_count *= 2)
714  	continue;
715        alloc_lines0 = (prefix_count + middle_guess
716  		      + MIN (context, suffix_guess));
717      }
718    else
719      {
720        prefix_count = 0;
721        alloc_lines0 = guess_lines (0, 0, n0);
722      }
723  
724    prefix_mask = prefix_count - 1;
725    lines = 0;
726    linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
727    p0 = buffer0;
728  
729    /* If the prefix is needed, find the prefix lines.  */
730    if (! (no_diff_means_no_output
731  	 && filevec[0].prefix_end == p0
732  	 && filevec[1].prefix_end == p1))
733      {
734        end0 = filevec[0].prefix_end;
735        while (p0 != end0)
736  	{
737  	  lin l = lines++ & prefix_mask;
738  	  if (l == alloc_lines0)
739  	    {
740  	      if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
741  		xalloc_die ();
742  	      alloc_lines0 *= 2;
743  	      linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
744  	    }
745  	  linbuf0[l] = p0;
746  	  while (*p0++ != '\n')
747  	    continue;
748  	}
749      }
750    buffered_prefix = prefix_count && context < lines ? context : lines;
751  
752    /* Allocate line buffer 1.  */
753  
754    middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
755    suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
756    alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
757    if (alloc_lines1 < buffered_prefix
758        || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
759      xalloc_die ();
760    linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
761  
762    if (buffered_prefix != lines)
763      {
764        /* Rotate prefix lines to proper location.  */
765        for (i = 0;  i < buffered_prefix;  i++)
766  	linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
767        for (i = 0;  i < buffered_prefix;  i++)
768  	linbuf0[i] = linbuf1[i];
769      }
770  
771    /* Initialize line buffer 1 from line buffer 0.  */
772    for (i = 0; i < buffered_prefix; i++)
773      linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
774  
775    /* Record the line buffer, adjusted so that
776       linbuf[0] points at the first differing line.  */
777    filevec[0].linbuf = linbuf0 + buffered_prefix;
778    filevec[1].linbuf = linbuf1 + buffered_prefix;
779    filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
780    filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
781    filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
782    filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
783  }
784  
785  /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
786     than 2**k.  This table is derived from Chris K. Caldwell's list
787     <http://www.utm.edu/research/primes/lists/2small/>.  */
788  
789  static unsigned char const prime_offset[] =
790  {
791    0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
792    15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
793    11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
794    55, 93, 1, 57, 25
795  };
796  
797  /* Verify that this host's size_t is not too wide for the above table.  */
798  
799  verify (enough_prime_offsets,
800  	sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
801  
802  /* Given a vector of two file_data objects, read the file associated
803     with each one, and build the table of equivalence classes.
804     Return nonzero if either file appears to be a binary file.
805     If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
806  
807  bool
808  read_files (struct file_data filevec[], bool pretend_binary)
809  {
810    int i;
811    bool skip_test = text | pretend_binary;
812    bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
813  
814    if (filevec[0].desc != filevec[1].desc)
815      appears_binary |= sip (&filevec[1], skip_test | appears_binary);
816    else
817      {
818        filevec[1].buffer = filevec[0].buffer;
819        filevec[1].bufsize = filevec[0].bufsize;
820        filevec[1].buffered = filevec[0].buffered;
821      }
822    if (appears_binary)
823      {
824        set_binary_mode (filevec[0].desc, true);
825        set_binary_mode (filevec[1].desc, true);
826        return true;
827      }
828  
829    find_identical_ends (filevec);
830  
831    equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
832    if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
833      xalloc_die ();
834    equivs = xmalloc (equivs_alloc * sizeof *equivs);
835    /* Equivalence class 0 is permanently safe for lines that were not
836       hashed.  Real equivalence classes start at 1.  */
837    equivs_index = 1;
838  
839    /* Allocate (one plus) a prime number of hash buckets.  Use a prime
840       number between 1/3 and 2/3 of the value of equiv_allocs,
841       approximately.  */
842    for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
843      continue;
844    nbuckets = ((size_t) 1 << i) - prime_offset[i];
845    if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
846      xalloc_die ();
847    buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
848    buckets++;
849  
850    for (i = 0; i < 2; i++)
851      find_and_hash_each_line (&filevec[i]);
852  
853    filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
854  
855    free (equivs);
856    free (buckets - 1);
857  
858    return false;
859  }
860