xref: /freebsd/contrib/diff/src/io.c (revision 0e8011faf58b743cc652e3b2ad0f7671227610df)
1 /* File I/O for GNU DIFF.
2 
3    Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002,
4    2004 Free Software Foundation, Inc.
5 
6    This file is part of GNU DIFF.
7 
8    GNU DIFF is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 2, or (at your option)
11    any later version.
12 
13    GNU DIFF is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17 
18    You should have received a copy of the GNU General Public License
19    along with this program; see the file COPYING.
20    If not, write to the Free Software Foundation,
21    59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
22 
23 #include "diff.h"
24 #include <cmpbuf.h>
25 #include <file-type.h>
26 #include <setmode.h>
27 #include <xalloc.h>
28 
29 /* Rotate an unsigned value to the left.  */
30 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
31 
32 /* Given a hash value and a new character, return a new hash value.  */
33 #define HASH(h, c) ((c) + ROL (h, 7))
34 
35 /* The type of a hash value.  */
36 typedef size_t hash_value;
37 verify (hash_value_is_unsigned, ! TYPE_SIGNED (hash_value));
38 
39 /* Lines are put into equivalence classes of lines that match in lines_differ.
40    Each equivalence class is represented by one of these structures,
41    but only while the classes are being computed.
42    Afterward, each class is represented by a number.  */
43 struct equivclass
44 {
45   lin next;		/* Next item in this bucket.  */
46   hash_value hash;	/* Hash of lines in this class.  */
47   char const *line;	/* A line that fits this class.  */
48   size_t length;	/* That line's length, not counting its newline.  */
49 };
50 
51 /* Hash-table: array of buckets, each being a chain of equivalence classes.
52    buckets[-1] is reserved for incomplete lines.  */
53 static lin *buckets;
54 
55 /* Number of buckets in the hash table array, not counting buckets[-1].  */
56 static size_t nbuckets;
57 
58 /* Array in which the equivalence classes are allocated.
59    The bucket-chains go through the elements in this array.
60    The number of an equivalence class is its index in this array.  */
61 static struct equivclass *equivs;
62 
63 /* Index of first free element in the array `equivs'.  */
64 static lin equivs_index;
65 
66 /* Number of elements allocated in the array `equivs'.  */
67 static lin equivs_alloc;
68 
69 /* Read a block of data into a file buffer, checking for EOF and error.  */
70 
71 void
72 file_block_read (struct file_data *current, size_t size)
73 {
74   if (size && ! current->eof)
75     {
76       size_t s = block_read (current->desc,
77 			     FILE_BUFFER (current) + current->buffered, size);
78       if (s == SIZE_MAX)
79 	pfatal_with_name (current->name);
80       current->buffered += s;
81       current->eof = s < size;
82     }
83 }
84 
85 /* Check for binary files and compare them for exact identity.  */
86 
87 /* Return 1 if BUF contains a non text character.
88    SIZE is the number of characters in BUF.  */
89 
90 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
91 
92 /* Get ready to read the current file.
93    Return nonzero if SKIP_TEST is zero,
94    and if it appears to be a binary file.  */
95 
96 static bool
97 sip (struct file_data *current, bool skip_test)
98 {
99   /* If we have a nonexistent file at this stage, treat it as empty.  */
100   if (current->desc < 0)
101     {
102       /* Leave room for a sentinel.  */
103       current->bufsize = sizeof (word);
104       current->buffer = xmalloc (current->bufsize);
105     }
106   else
107     {
108       current->bufsize = buffer_lcm (sizeof (word),
109 				     STAT_BLOCKSIZE (current->stat),
110 				     PTRDIFF_MAX - 2 * sizeof (word));
111       current->buffer = xmalloc (current->bufsize);
112 
113       if (! skip_test)
114 	{
115 	  /* Check first part of file to see if it's a binary file.  */
116 
117 	  bool was_binary = set_binary_mode (current->desc, true);
118 	  off_t buffered;
119 	  file_block_read (current, current->bufsize);
120 	  buffered = current->buffered;
121 
122 	  if (! was_binary)
123 	    {
124 	      /* Revert to text mode and seek back to the beginning to
125 		 reread the file.  Use relative seek, since file
126 		 descriptors like stdin might not start at offset
127 		 zero.  */
128 
129 	      if (lseek (current->desc, - buffered, SEEK_CUR) == -1)
130 		pfatal_with_name (current->name);
131 	      set_binary_mode (current->desc, false);
132 	      current->buffered = 0;
133 	      current->eof = false;
134 	    }
135 
136 	  return binary_file_p (current->buffer, buffered);
137 	}
138     }
139 
140   current->buffered = 0;
141   current->eof = false;
142   return false;
143 }
144 
145 /* Slurp the rest of the current file completely into memory.  */
146 
147 static void
148 slurp (struct file_data *current)
149 {
150   size_t cc;
151 
152   if (current->desc < 0)
153     {
154       /* The file is nonexistent.  */
155       return;
156     }
157 
158   if (S_ISREG (current->stat.st_mode))
159     {
160       /* It's a regular file; slurp in the rest all at once.  */
161 
162       /* Get the size out of the stat block.
163 	 Allocate just enough room for appended newline plus word sentinel,
164 	 plus word-alignment since we want the buffer word-aligned.  */
165       size_t file_size = current->stat.st_size;
166       cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
167       if (file_size != current->stat.st_size || cc < file_size
168 	  || PTRDIFF_MAX <= cc)
169 	xalloc_die ();
170 
171       if (current->bufsize < cc)
172 	{
173 	  current->bufsize = cc;
174 	  current->buffer = xrealloc (current->buffer, cc);
175 	}
176 
177       /* Try to read at least 1 more byte than the size indicates, to
178 	 detect whether the file is growing.  This is a nicety for
179 	 users who run 'diff' on files while they are changing.  */
180 
181       if (current->buffered <= file_size)
182 	{
183 	  file_block_read (current, file_size + 1 - current->buffered);
184 	  if (current->buffered <= file_size)
185 	    return;
186 	}
187     }
188 
189   /* It's not a regular file, or it's a growing regular file; read it,
190      growing the buffer as needed.  */
191 
192   file_block_read (current, current->bufsize - current->buffered);
193 
194   if (current->buffered)
195     {
196       while (current->buffered == current->bufsize)
197 	{
198 	  if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
199 	    xalloc_die ();
200 	  current->bufsize *= 2;
201 	  current->buffer = xrealloc (current->buffer, current->bufsize);
202 	  file_block_read (current, current->bufsize - current->buffered);
203 	}
204 
205       /* Allocate just enough room for appended newline plus word
206 	 sentinel, plus word-alignment.  */
207       cc = current->buffered + 2 * sizeof (word);
208       current->bufsize = cc - cc % sizeof (word);
209       current->buffer = xrealloc (current->buffer, current->bufsize);
210     }
211 }
212 
213 /* Split the file into lines, simultaneously computing the equivalence
214    class for each line.  */
215 
216 static void
217 find_and_hash_each_line (struct file_data *current)
218 {
219   hash_value h;
220   char const *p = current->prefix_end;
221   unsigned char c;
222   lin i, *bucket;
223   size_t length;
224 
225   /* Cache often-used quantities in local variables to help the compiler.  */
226   char const **linbuf = current->linbuf;
227   lin alloc_lines = current->alloc_lines;
228   lin line = 0;
229   lin linbuf_base = current->linbuf_base;
230   lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
231   struct equivclass *eqs = equivs;
232   lin eqs_index = equivs_index;
233   lin eqs_alloc = equivs_alloc;
234   char const *suffix_begin = current->suffix_begin;
235   char const *bufend = FILE_BUFFER (current) + current->buffered;
236   bool diff_length_compare_anyway =
237     ignore_white_space != IGNORE_NO_WHITE_SPACE;
238   bool same_length_diff_contents_compare_anyway =
239     diff_length_compare_anyway | ignore_case;
240 
241   while (p < suffix_begin)
242     {
243       char const *ip = p;
244 
245       h = 0;
246 
247       /* Hash this line until we find a newline.  */
248       if (ignore_case)
249 	switch (ignore_white_space)
250 	  {
251 	  case IGNORE_ALL_SPACE:
252 	    while ((c = *p++) != '\n')
253 	      if (! isspace (c))
254 		h = HASH (h, tolower (c));
255 	    break;
256 
257 	  case IGNORE_SPACE_CHANGE:
258 	    while ((c = *p++) != '\n')
259 	      {
260 		if (isspace (c))
261 		  {
262 		    do
263 		      if ((c = *p++) == '\n')
264 			goto hashing_done;
265 		    while (isspace (c));
266 
267 		    h = HASH (h, ' ');
268 		  }
269 
270 		/* C is now the first non-space.  */
271 		h = HASH (h, tolower (c));
272 	      }
273 	    break;
274 
275 	  case IGNORE_TAB_EXPANSION:
276 	    {
277 	      size_t column = 0;
278 	      while ((c = *p++) != '\n')
279 		{
280 		  size_t repetitions = 1;
281 
282 		  switch (c)
283 		    {
284 		    case '\b':
285 		      column -= 0 < column;
286 		      break;
287 
288 		    case '\t':
289 		      c = ' ';
290 		      repetitions = tabsize - column % tabsize;
291 		      column = (column + repetitions < column
292 				? 0
293 				: column + repetitions);
294 		      break;
295 
296 		    case '\r':
297 		      column = 0;
298 		      break;
299 
300 		    default:
301 		      c = tolower (c);
302 		      column++;
303 		      break;
304 		    }
305 
306 		  do
307 		    h = HASH (h, c);
308 		  while (--repetitions != 0);
309 		}
310 	    }
311 	    break;
312 
313 	  default:
314 	    while ((c = *p++) != '\n')
315 	      h = HASH (h, tolower (c));
316 	    break;
317 	  }
318       else
319 	switch (ignore_white_space)
320 	  {
321 	  case IGNORE_ALL_SPACE:
322 	    while ((c = *p++) != '\n')
323 	      if (! isspace (c))
324 		h = HASH (h, c);
325 	    break;
326 
327 	  case IGNORE_SPACE_CHANGE:
328 	    while ((c = *p++) != '\n')
329 	      {
330 		if (isspace (c))
331 		  {
332 		    do
333 		      if ((c = *p++) == '\n')
334 			goto hashing_done;
335 		    while (isspace (c));
336 
337 		    h = HASH (h, ' ');
338 		  }
339 
340 		/* C is now the first non-space.  */
341 		h = HASH (h, c);
342 	      }
343 	    break;
344 
345 	  case IGNORE_TAB_EXPANSION:
346 	    {
347 	      size_t column = 0;
348 	      while ((c = *p++) != '\n')
349 		{
350 		  size_t repetitions = 1;
351 
352 		  switch (c)
353 		    {
354 		    case '\b':
355 		      column -= 0 < column;
356 		      break;
357 
358 		    case '\t':
359 		      c = ' ';
360 		      repetitions = tabsize - column % tabsize;
361 		      column = (column + repetitions < column
362 				? 0
363 				: column + repetitions);
364 		      break;
365 
366 		    case '\r':
367 		      column = 0;
368 		      break;
369 
370 		    default:
371 		      column++;
372 		      break;
373 		    }
374 
375 		  do
376 		    h = HASH (h, c);
377 		  while (--repetitions != 0);
378 		}
379 	    }
380 	    break;
381 
382 	  default:
383 	    while ((c = *p++) != '\n')
384 	      h = HASH (h, c);
385 	    break;
386 	  }
387 
388    hashing_done:;
389 
390       bucket = &buckets[h % nbuckets];
391       length = p - ip - 1;
392 
393       if (p == bufend
394 	  && current->missing_newline
395 	  && ROBUST_OUTPUT_STYLE (output_style))
396 	{
397 	  /* This line is incomplete.  If this is significant,
398 	     put the line into buckets[-1].  */
399 	  if (ignore_white_space < IGNORE_SPACE_CHANGE)
400 	    bucket = &buckets[-1];
401 
402 	  /* Omit the inserted newline when computing linbuf later.  */
403 	  p--;
404 	  bufend = suffix_begin = p;
405 	}
406 
407       for (i = *bucket;  ;  i = eqs[i].next)
408 	if (!i)
409 	  {
410 	    /* Create a new equivalence class in this bucket.  */
411 	    i = eqs_index++;
412 	    if (i == eqs_alloc)
413 	      {
414 		if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
415 		  xalloc_die ();
416 		eqs_alloc *= 2;
417 		eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
418 	      }
419 	    eqs[i].next = *bucket;
420 	    eqs[i].hash = h;
421 	    eqs[i].line = ip;
422 	    eqs[i].length = length;
423 	    *bucket = i;
424 	    break;
425 	  }
426 	else if (eqs[i].hash == h)
427 	  {
428 	    char const *eqline = eqs[i].line;
429 
430 	    /* Reuse existing class if lines_differ reports the lines
431                equal.  */
432 	    if (eqs[i].length == length)
433 	      {
434 		/* Reuse existing equivalence class if the lines are identical.
435 		   This detects the common case of exact identity
436 		   faster than lines_differ would.  */
437 		if (memcmp (eqline, ip, length) == 0)
438 		  break;
439 		if (!same_length_diff_contents_compare_anyway)
440 		  continue;
441 	      }
442 	    else if (!diff_length_compare_anyway)
443 	      continue;
444 
445 	    if (! lines_differ (eqline, ip))
446 	      break;
447 	  }
448 
449       /* Maybe increase the size of the line table.  */
450       if (line == alloc_lines)
451 	{
452 	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
453 	  if (PTRDIFF_MAX / 3 <= alloc_lines
454 	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
455 	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
456 	    xalloc_die ();
457 	  alloc_lines = 2 * alloc_lines - linbuf_base;
458 	  cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
459 	  linbuf += linbuf_base;
460 	  linbuf = xrealloc (linbuf,
461 			     (alloc_lines - linbuf_base) * sizeof *linbuf);
462 	  linbuf -= linbuf_base;
463 	}
464       linbuf[line] = ip;
465       cureqs[line] = i;
466       ++line;
467     }
468 
469   current->buffered_lines = line;
470 
471   for (i = 0;  ;  i++)
472     {
473       /* Record the line start for lines in the suffix that we care about.
474 	 Record one more line start than lines,
475 	 so that we can compute the length of any buffered line.  */
476       if (line == alloc_lines)
477 	{
478 	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
479 	  if (PTRDIFF_MAX / 3 <= alloc_lines
480 	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
481 	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
482 	    xalloc_die ();
483 	  alloc_lines = 2 * alloc_lines - linbuf_base;
484 	  linbuf += linbuf_base;
485 	  linbuf = xrealloc (linbuf,
486 			     (alloc_lines - linbuf_base) * sizeof *linbuf);
487 	  linbuf -= linbuf_base;
488 	}
489       linbuf[line] = p;
490 
491       if (p == bufend)
492 	break;
493 
494       if (context <= i && no_diff_means_no_output)
495 	break;
496 
497       line++;
498 
499       while (*p++ != '\n')
500 	continue;
501     }
502 
503   /* Done with cache in local variables.  */
504   current->linbuf = linbuf;
505   current->valid_lines = line;
506   current->alloc_lines = alloc_lines;
507   current->equivs = cureqs;
508   equivs = eqs;
509   equivs_alloc = eqs_alloc;
510   equivs_index = eqs_index;
511 }
512 
513 /* Prepare the text.  Make sure the text end is initialized.
514    Make sure text ends in a newline,
515    but remember that we had to add one.
516    Strip trailing CRs, if that was requested.  */
517 
518 static void
519 prepare_text (struct file_data *current)
520 {
521   size_t buffered = current->buffered;
522   char *p = FILE_BUFFER (current);
523   char *dst;
524 
525   if (buffered == 0 || p[buffered - 1] == '\n')
526     current->missing_newline = false;
527   else
528     {
529       p[buffered++] = '\n';
530       current->missing_newline = true;
531     }
532 
533   if (!p)
534     return;
535 
536   /* Don't use uninitialized storage when planting or using sentinels.  */
537   memset (p + buffered, 0, sizeof (word));
538 
539   if (strip_trailing_cr && (dst = memchr (p, '\r', buffered)))
540     {
541       char const *src = dst;
542       char const *srclim = p + buffered;
543 
544       do
545 	dst += ! ((*dst = *src++) == '\r' && *src == '\n');
546       while (src < srclim);
547 
548       buffered -= src - dst;
549     }
550 
551   current->buffered = buffered;
552 }
553 
554 /* We have found N lines in a buffer of size S; guess the
555    proportionate number of lines that will be found in a buffer of
556    size T.  However, do not guess a number of lines so large that the
557    resulting line table might cause overflow in size calculations.  */
558 static lin
559 guess_lines (lin n, size_t s, size_t t)
560 {
561   size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
562   lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
563   return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
564 }
565 
566 /* Given a vector of two file_data objects, find the identical
567    prefixes and suffixes of each object.  */
568 
569 static void
570 find_identical_ends (struct file_data filevec[])
571 {
572   word *w0, *w1;
573   char *p0, *p1, *buffer0, *buffer1;
574   char const *end0, *beg0;
575   char const **linbuf0, **linbuf1;
576   lin i, lines;
577   size_t n0, n1;
578   lin alloc_lines0, alloc_lines1;
579   lin buffered_prefix, prefix_count, prefix_mask;
580   lin middle_guess, suffix_guess;
581 
582   slurp (&filevec[0]);
583   prepare_text (&filevec[0]);
584   if (filevec[0].desc != filevec[1].desc)
585     {
586       slurp (&filevec[1]);
587       prepare_text (&filevec[1]);
588     }
589   else
590     {
591       filevec[1].buffer = filevec[0].buffer;
592       filevec[1].bufsize = filevec[0].bufsize;
593       filevec[1].buffered = filevec[0].buffered;
594       filevec[1].missing_newline = filevec[0].missing_newline;
595     }
596 
597   /* Find identical prefix.  */
598 
599   w0 = filevec[0].buffer;
600   w1 = filevec[1].buffer;
601   p0 = buffer0 = (char *) w0;
602   p1 = buffer1 = (char *) w1;
603   n0 = filevec[0].buffered;
604   n1 = filevec[1].buffered;
605 
606   if (p0 == p1)
607     /* The buffers are the same; sentinels won't work.  */
608     p0 = p1 += n1;
609   else
610     {
611       /* Insert end sentinels, in this case characters that are guaranteed
612 	 to make the equality test false, and thus terminate the loop.  */
613 
614       if (n0 < n1)
615 	p0[n0] = ~p1[n0];
616       else
617 	p1[n1] = ~p0[n1];
618 
619       /* Loop until first mismatch, or to the sentinel characters.  */
620 
621       /* Compare a word at a time for speed.  */
622       while (*w0 == *w1)
623 	w0++, w1++;
624 
625       /* Do the last few bytes of comparison a byte at a time.  */
626       p0 = (char *) w0;
627       p1 = (char *) w1;
628       while (*p0 == *p1)
629 	p0++, p1++;
630 
631       /* Don't mistakenly count missing newline as part of prefix.  */
632       if (ROBUST_OUTPUT_STYLE (output_style)
633 	  && ((buffer0 + n0 - filevec[0].missing_newline < p0)
634 	      !=
635 	      (buffer1 + n1 - filevec[1].missing_newline < p1)))
636 	p0--, p1--;
637     }
638 
639   /* Now P0 and P1 point at the first nonmatching characters.  */
640 
641   /* Skip back to last line-beginning in the prefix,
642      and then discard up to HORIZON_LINES lines from the prefix.  */
643   i = horizon_lines;
644   while (p0 != buffer0 && (p0[-1] != '\n' || i--))
645     p0--, p1--;
646 
647   /* Record the prefix.  */
648   filevec[0].prefix_end = p0;
649   filevec[1].prefix_end = p1;
650 
651   /* Find identical suffix.  */
652 
653   /* P0 and P1 point beyond the last chars not yet compared.  */
654   p0 = buffer0 + n0;
655   p1 = buffer1 + n1;
656 
657   if (! ROBUST_OUTPUT_STYLE (output_style)
658       || filevec[0].missing_newline == filevec[1].missing_newline)
659     {
660       end0 = p0;	/* Addr of last char in file 0.  */
661 
662       /* Get value of P0 at which we should stop scanning backward:
663 	 this is when either P0 or P1 points just past the last char
664 	 of the identical prefix.  */
665       beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
666 
667       /* Scan back until chars don't match or we reach that point.  */
668       for (; p0 != beg0; p0--, p1--)
669 	if (*p0 != *p1)
670 	  {
671 	    /* Point at the first char of the matching suffix.  */
672 	    beg0 = p0;
673 	    break;
674 	  }
675 
676       /* Are we at a line-beginning in both files?  If not, add the rest of
677 	 this line to the main body.  Discard up to HORIZON_LINES lines from
678 	 the identical suffix.  Also, discard one extra line,
679 	 because shift_boundaries may need it.  */
680       i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
681 			    &&
682 			    (buffer1 == p1 || p1[-1] == '\n'));
683       while (i-- && p0 != end0)
684 	while (*p0++ != '\n')
685 	  continue;
686 
687       p1 += p0 - beg0;
688     }
689 
690   /* Record the suffix.  */
691   filevec[0].suffix_begin = p0;
692   filevec[1].suffix_begin = p1;
693 
694   /* Calculate number of lines of prefix to save.
695 
696      prefix_count == 0 means save the whole prefix;
697      we need this for options like -D that output the whole file,
698      or for enormous contexts (to avoid worrying about arithmetic overflow).
699      We also need it for options like -F that output some preceding line;
700      at least we will need to find the last few lines,
701      but since we don't know how many, it's easiest to find them all.
702 
703      Otherwise, prefix_count != 0.  Save just prefix_count lines at start
704      of the line buffer; they'll be moved to the proper location later.
705      Handle 1 more line than the context says (because we count 1 too many),
706      rounded up to the next power of 2 to speed index computation.  */
707 
708   if (no_diff_means_no_output && ! function_regexp.fastmap
709       && context < LIN_MAX / 4 && context < n0)
710     {
711       middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
712       suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
713       for (prefix_count = 1;  prefix_count <= context;  prefix_count *= 2)
714 	continue;
715       alloc_lines0 = (prefix_count + middle_guess
716 		      + MIN (context, suffix_guess));
717     }
718   else
719     {
720       prefix_count = 0;
721       alloc_lines0 = guess_lines (0, 0, n0);
722     }
723 
724   prefix_mask = prefix_count - 1;
725   lines = 0;
726   linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
727   p0 = buffer0;
728 
729   /* If the prefix is needed, find the prefix lines.  */
730   if (! (no_diff_means_no_output
731 	 && filevec[0].prefix_end == p0
732 	 && filevec[1].prefix_end == p1))
733     {
734       end0 = filevec[0].prefix_end;
735       while (p0 != end0)
736 	{
737 	  lin l = lines++ & prefix_mask;
738 	  if (l == alloc_lines0)
739 	    {
740 	      if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
741 		xalloc_die ();
742 	      alloc_lines0 *= 2;
743 	      linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
744 	    }
745 	  linbuf0[l] = p0;
746 	  while (*p0++ != '\n')
747 	    continue;
748 	}
749     }
750   buffered_prefix = prefix_count && context < lines ? context : lines;
751 
752   /* Allocate line buffer 1.  */
753 
754   middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
755   suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
756   alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
757   if (alloc_lines1 < buffered_prefix
758       || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
759     xalloc_die ();
760   linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
761 
762   if (buffered_prefix != lines)
763     {
764       /* Rotate prefix lines to proper location.  */
765       for (i = 0;  i < buffered_prefix;  i++)
766 	linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
767       for (i = 0;  i < buffered_prefix;  i++)
768 	linbuf0[i] = linbuf1[i];
769     }
770 
771   /* Initialize line buffer 1 from line buffer 0.  */
772   for (i = 0; i < buffered_prefix; i++)
773     linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
774 
775   /* Record the line buffer, adjusted so that
776      linbuf[0] points at the first differing line.  */
777   filevec[0].linbuf = linbuf0 + buffered_prefix;
778   filevec[1].linbuf = linbuf1 + buffered_prefix;
779   filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
780   filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
781   filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
782   filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
783 }
784 
785 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
786    than 2**k.  This table is derived from Chris K. Caldwell's list
787    <http://www.utm.edu/research/primes/lists/2small/>.  */
788 
789 static unsigned char const prime_offset[] =
790 {
791   0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
792   15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
793   11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
794   55, 93, 1, 57, 25
795 };
796 
797 /* Verify that this host's size_t is not too wide for the above table.  */
798 
799 verify (enough_prime_offsets,
800 	sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
801 
802 /* Given a vector of two file_data objects, read the file associated
803    with each one, and build the table of equivalence classes.
804    Return nonzero if either file appears to be a binary file.
805    If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
806 
807 bool
808 read_files (struct file_data filevec[], bool pretend_binary)
809 {
810   int i;
811   bool skip_test = text | pretend_binary;
812   bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
813 
814   if (filevec[0].desc != filevec[1].desc)
815     appears_binary |= sip (&filevec[1], skip_test | appears_binary);
816   else
817     {
818       filevec[1].buffer = filevec[0].buffer;
819       filevec[1].bufsize = filevec[0].bufsize;
820       filevec[1].buffered = filevec[0].buffered;
821     }
822   if (appears_binary)
823     {
824       set_binary_mode (filevec[0].desc, true);
825       set_binary_mode (filevec[1].desc, true);
826       return true;
827     }
828 
829   find_identical_ends (filevec);
830 
831   equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
832   if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
833     xalloc_die ();
834   equivs = xmalloc (equivs_alloc * sizeof *equivs);
835   /* Equivalence class 0 is permanently safe for lines that were not
836      hashed.  Real equivalence classes start at 1.  */
837   equivs_index = 1;
838 
839   /* Allocate (one plus) a prime number of hash buckets.  Use a prime
840      number between 1/3 and 2/3 of the value of equiv_allocs,
841      approximately.  */
842   for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
843     continue;
844   nbuckets = ((size_t) 1 << i) - prime_offset[i];
845   if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
846     xalloc_die ();
847   buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
848   buckets++;
849 
850   for (i = 0; i < 2; i++)
851     find_and_hash_each_line (&filevec[i]);
852 
853   filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
854 
855   free (equivs);
856   free (buckets - 1);
857 
858   return false;
859 }
860