xref: /freebsd/contrib/diff/src/io.c (revision 18fd37a72c3a7549d2d4f6c6ea00bdcd2bdaca01)
118fd37a7SXin LI /* File I/O for GNU DIFF.
218fd37a7SXin LI 
318fd37a7SXin LI    Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002,
418fd37a7SXin LI    2004 Free Software Foundation, Inc.
518fd37a7SXin LI 
618fd37a7SXin LI    This file is part of GNU DIFF.
718fd37a7SXin LI 
818fd37a7SXin LI    GNU DIFF is free software; you can redistribute it and/or modify
918fd37a7SXin LI    it under the terms of the GNU General Public License as published by
1018fd37a7SXin LI    the Free Software Foundation; either version 2, or (at your option)
1118fd37a7SXin LI    any later version.
1218fd37a7SXin LI 
1318fd37a7SXin LI    GNU DIFF is distributed in the hope that it will be useful,
1418fd37a7SXin LI    but WITHOUT ANY WARRANTY; without even the implied warranty of
1518fd37a7SXin LI    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1618fd37a7SXin LI    GNU General Public License for more details.
1718fd37a7SXin LI 
1818fd37a7SXin LI    You should have received a copy of the GNU General Public License
1918fd37a7SXin LI    along with this program; see the file COPYING.
2018fd37a7SXin LI    If not, write to the Free Software Foundation,
2118fd37a7SXin LI    59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
2218fd37a7SXin LI 
2318fd37a7SXin LI #include "diff.h"
2418fd37a7SXin LI #include <cmpbuf.h>
2518fd37a7SXin LI #include <file-type.h>
2618fd37a7SXin LI #include <setmode.h>
2718fd37a7SXin LI #include <xalloc.h>
2818fd37a7SXin LI 
2918fd37a7SXin LI /* Rotate an unsigned value to the left.  */
3018fd37a7SXin LI #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
3118fd37a7SXin LI 
3218fd37a7SXin LI /* Given a hash value and a new character, return a new hash value.  */
3318fd37a7SXin LI #define HASH(h, c) ((c) + ROL (h, 7))
3418fd37a7SXin LI 
3518fd37a7SXin LI /* The type of a hash value.  */
3618fd37a7SXin LI typedef size_t hash_value;
3718fd37a7SXin LI verify (hash_value_is_unsigned, ! TYPE_SIGNED (hash_value));
3818fd37a7SXin LI 
3918fd37a7SXin LI /* Lines are put into equivalence classes of lines that match in lines_differ.
4018fd37a7SXin LI    Each equivalence class is represented by one of these structures,
4118fd37a7SXin LI    but only while the classes are being computed.
4218fd37a7SXin LI    Afterward, each class is represented by a number.  */
4318fd37a7SXin LI struct equivclass
4418fd37a7SXin LI {
4518fd37a7SXin LI   lin next;		/* Next item in this bucket.  */
4618fd37a7SXin LI   hash_value hash;	/* Hash of lines in this class.  */
4718fd37a7SXin LI   char const *line;	/* A line that fits this class.  */
4818fd37a7SXin LI   size_t length;	/* That line's length, not counting its newline.  */
4918fd37a7SXin LI };
5018fd37a7SXin LI 
5118fd37a7SXin LI /* Hash-table: array of buckets, each being a chain of equivalence classes.
5218fd37a7SXin LI    buckets[-1] is reserved for incomplete lines.  */
5318fd37a7SXin LI static lin *buckets;
5418fd37a7SXin LI 
5518fd37a7SXin LI /* Number of buckets in the hash table array, not counting buckets[-1].  */
5618fd37a7SXin LI static size_t nbuckets;
5718fd37a7SXin LI 
5818fd37a7SXin LI /* Array in which the equivalence classes are allocated.
5918fd37a7SXin LI    The bucket-chains go through the elements in this array.
6018fd37a7SXin LI    The number of an equivalence class is its index in this array.  */
6118fd37a7SXin LI static struct equivclass *equivs;
6218fd37a7SXin LI 
6318fd37a7SXin LI /* Index of first free element in the array `equivs'.  */
6418fd37a7SXin LI static lin equivs_index;
6518fd37a7SXin LI 
6618fd37a7SXin LI /* Number of elements allocated in the array `equivs'.  */
6718fd37a7SXin LI static lin equivs_alloc;
6818fd37a7SXin LI 
6918fd37a7SXin LI /* Read a block of data into a file buffer, checking for EOF and error.  */
7018fd37a7SXin LI 
7118fd37a7SXin LI void
file_block_read(struct file_data * current,size_t size)7218fd37a7SXin LI file_block_read (struct file_data *current, size_t size)
7318fd37a7SXin LI {
7418fd37a7SXin LI   if (size && ! current->eof)
7518fd37a7SXin LI     {
7618fd37a7SXin LI       size_t s = block_read (current->desc,
7718fd37a7SXin LI 			     FILE_BUFFER (current) + current->buffered, size);
7818fd37a7SXin LI       if (s == SIZE_MAX)
7918fd37a7SXin LI 	pfatal_with_name (current->name);
8018fd37a7SXin LI       current->buffered += s;
8118fd37a7SXin LI       current->eof = s < size;
8218fd37a7SXin LI     }
8318fd37a7SXin LI }
8418fd37a7SXin LI 
8518fd37a7SXin LI /* Check for binary files and compare them for exact identity.  */
8618fd37a7SXin LI 
8718fd37a7SXin LI /* Return 1 if BUF contains a non text character.
8818fd37a7SXin LI    SIZE is the number of characters in BUF.  */
8918fd37a7SXin LI 
9018fd37a7SXin LI #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
9118fd37a7SXin LI 
9218fd37a7SXin LI /* Get ready to read the current file.
9318fd37a7SXin LI    Return nonzero if SKIP_TEST is zero,
9418fd37a7SXin LI    and if it appears to be a binary file.  */
9518fd37a7SXin LI 
9618fd37a7SXin LI static bool
sip(struct file_data * current,bool skip_test)9718fd37a7SXin LI sip (struct file_data *current, bool skip_test)
9818fd37a7SXin LI {
9918fd37a7SXin LI   /* If we have a nonexistent file at this stage, treat it as empty.  */
10018fd37a7SXin LI   if (current->desc < 0)
10118fd37a7SXin LI     {
10218fd37a7SXin LI       /* Leave room for a sentinel.  */
10318fd37a7SXin LI       current->bufsize = sizeof (word);
10418fd37a7SXin LI       current->buffer = xmalloc (current->bufsize);
10518fd37a7SXin LI     }
10618fd37a7SXin LI   else
10718fd37a7SXin LI     {
10818fd37a7SXin LI       current->bufsize = buffer_lcm (sizeof (word),
10918fd37a7SXin LI 				     STAT_BLOCKSIZE (current->stat),
11018fd37a7SXin LI 				     PTRDIFF_MAX - 2 * sizeof (word));
11118fd37a7SXin LI       current->buffer = xmalloc (current->bufsize);
11218fd37a7SXin LI 
11318fd37a7SXin LI       if (! skip_test)
11418fd37a7SXin LI 	{
11518fd37a7SXin LI 	  /* Check first part of file to see if it's a binary file.  */
11618fd37a7SXin LI 
11718fd37a7SXin LI 	  bool was_binary = set_binary_mode (current->desc, true);
11818fd37a7SXin LI 	  off_t buffered;
11918fd37a7SXin LI 	  file_block_read (current, current->bufsize);
12018fd37a7SXin LI 	  buffered = current->buffered;
12118fd37a7SXin LI 
12218fd37a7SXin LI 	  if (! was_binary)
12318fd37a7SXin LI 	    {
12418fd37a7SXin LI 	      /* Revert to text mode and seek back to the beginning to
12518fd37a7SXin LI 		 reread the file.  Use relative seek, since file
12618fd37a7SXin LI 		 descriptors like stdin might not start at offset
12718fd37a7SXin LI 		 zero.  */
12818fd37a7SXin LI 
12918fd37a7SXin LI 	      if (lseek (current->desc, - buffered, SEEK_CUR) == -1)
13018fd37a7SXin LI 		pfatal_with_name (current->name);
13118fd37a7SXin LI 	      set_binary_mode (current->desc, false);
13218fd37a7SXin LI 	      current->buffered = 0;
13318fd37a7SXin LI 	      current->eof = false;
13418fd37a7SXin LI 	    }
13518fd37a7SXin LI 
13618fd37a7SXin LI 	  return binary_file_p (current->buffer, buffered);
13718fd37a7SXin LI 	}
13818fd37a7SXin LI     }
13918fd37a7SXin LI 
14018fd37a7SXin LI   current->buffered = 0;
14118fd37a7SXin LI   current->eof = false;
14218fd37a7SXin LI   return false;
14318fd37a7SXin LI }
14418fd37a7SXin LI 
14518fd37a7SXin LI /* Slurp the rest of the current file completely into memory.  */
14618fd37a7SXin LI 
14718fd37a7SXin LI static void
slurp(struct file_data * current)14818fd37a7SXin LI slurp (struct file_data *current)
14918fd37a7SXin LI {
15018fd37a7SXin LI   size_t cc;
15118fd37a7SXin LI 
15218fd37a7SXin LI   if (current->desc < 0)
15318fd37a7SXin LI     {
15418fd37a7SXin LI       /* The file is nonexistent.  */
15518fd37a7SXin LI       return;
15618fd37a7SXin LI     }
15718fd37a7SXin LI 
15818fd37a7SXin LI   if (S_ISREG (current->stat.st_mode))
15918fd37a7SXin LI     {
16018fd37a7SXin LI       /* It's a regular file; slurp in the rest all at once.  */
16118fd37a7SXin LI 
16218fd37a7SXin LI       /* Get the size out of the stat block.
16318fd37a7SXin LI 	 Allocate just enough room for appended newline plus word sentinel,
16418fd37a7SXin LI 	 plus word-alignment since we want the buffer word-aligned.  */
16518fd37a7SXin LI       size_t file_size = current->stat.st_size;
16618fd37a7SXin LI       cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
16718fd37a7SXin LI       if (file_size != current->stat.st_size || cc < file_size
16818fd37a7SXin LI 	  || PTRDIFF_MAX <= cc)
16918fd37a7SXin LI 	xalloc_die ();
17018fd37a7SXin LI 
17118fd37a7SXin LI       if (current->bufsize < cc)
17218fd37a7SXin LI 	{
17318fd37a7SXin LI 	  current->bufsize = cc;
17418fd37a7SXin LI 	  current->buffer = xrealloc (current->buffer, cc);
17518fd37a7SXin LI 	}
17618fd37a7SXin LI 
17718fd37a7SXin LI       /* Try to read at least 1 more byte than the size indicates, to
17818fd37a7SXin LI 	 detect whether the file is growing.  This is a nicety for
17918fd37a7SXin LI 	 users who run 'diff' on files while they are changing.  */
18018fd37a7SXin LI 
18118fd37a7SXin LI       if (current->buffered <= file_size)
18218fd37a7SXin LI 	{
18318fd37a7SXin LI 	  file_block_read (current, file_size + 1 - current->buffered);
18418fd37a7SXin LI 	  if (current->buffered <= file_size)
18518fd37a7SXin LI 	    return;
18618fd37a7SXin LI 	}
18718fd37a7SXin LI     }
18818fd37a7SXin LI 
18918fd37a7SXin LI   /* It's not a regular file, or it's a growing regular file; read it,
19018fd37a7SXin LI      growing the buffer as needed.  */
19118fd37a7SXin LI 
19218fd37a7SXin LI   file_block_read (current, current->bufsize - current->buffered);
19318fd37a7SXin LI 
19418fd37a7SXin LI   if (current->buffered)
19518fd37a7SXin LI     {
19618fd37a7SXin LI       while (current->buffered == current->bufsize)
19718fd37a7SXin LI 	{
19818fd37a7SXin LI 	  if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
19918fd37a7SXin LI 	    xalloc_die ();
20018fd37a7SXin LI 	  current->bufsize *= 2;
20118fd37a7SXin LI 	  current->buffer = xrealloc (current->buffer, current->bufsize);
20218fd37a7SXin LI 	  file_block_read (current, current->bufsize - current->buffered);
20318fd37a7SXin LI 	}
20418fd37a7SXin LI 
20518fd37a7SXin LI       /* Allocate just enough room for appended newline plus word
20618fd37a7SXin LI 	 sentinel, plus word-alignment.  */
20718fd37a7SXin LI       cc = current->buffered + 2 * sizeof (word);
20818fd37a7SXin LI       current->bufsize = cc - cc % sizeof (word);
20918fd37a7SXin LI       current->buffer = xrealloc (current->buffer, current->bufsize);
21018fd37a7SXin LI     }
21118fd37a7SXin LI }
21218fd37a7SXin LI 
21318fd37a7SXin LI /* Split the file into lines, simultaneously computing the equivalence
21418fd37a7SXin LI    class for each line.  */
21518fd37a7SXin LI 
21618fd37a7SXin LI static void
find_and_hash_each_line(struct file_data * current)21718fd37a7SXin LI find_and_hash_each_line (struct file_data *current)
21818fd37a7SXin LI {
21918fd37a7SXin LI   hash_value h;
22018fd37a7SXin LI   char const *p = current->prefix_end;
22118fd37a7SXin LI   unsigned char c;
22218fd37a7SXin LI   lin i, *bucket;
22318fd37a7SXin LI   size_t length;
22418fd37a7SXin LI 
22518fd37a7SXin LI   /* Cache often-used quantities in local variables to help the compiler.  */
22618fd37a7SXin LI   char const **linbuf = current->linbuf;
22718fd37a7SXin LI   lin alloc_lines = current->alloc_lines;
22818fd37a7SXin LI   lin line = 0;
22918fd37a7SXin LI   lin linbuf_base = current->linbuf_base;
23018fd37a7SXin LI   lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
23118fd37a7SXin LI   struct equivclass *eqs = equivs;
23218fd37a7SXin LI   lin eqs_index = equivs_index;
23318fd37a7SXin LI   lin eqs_alloc = equivs_alloc;
23418fd37a7SXin LI   char const *suffix_begin = current->suffix_begin;
23518fd37a7SXin LI   char const *bufend = FILE_BUFFER (current) + current->buffered;
23618fd37a7SXin LI   bool diff_length_compare_anyway =
23718fd37a7SXin LI     ignore_white_space != IGNORE_NO_WHITE_SPACE;
23818fd37a7SXin LI   bool same_length_diff_contents_compare_anyway =
23918fd37a7SXin LI     diff_length_compare_anyway | ignore_case;
24018fd37a7SXin LI 
24118fd37a7SXin LI   while (p < suffix_begin)
24218fd37a7SXin LI     {
24318fd37a7SXin LI       char const *ip = p;
24418fd37a7SXin LI 
24518fd37a7SXin LI       h = 0;
24618fd37a7SXin LI 
24718fd37a7SXin LI       /* Hash this line until we find a newline.  */
24818fd37a7SXin LI       if (ignore_case)
24918fd37a7SXin LI 	switch (ignore_white_space)
25018fd37a7SXin LI 	  {
25118fd37a7SXin LI 	  case IGNORE_ALL_SPACE:
25218fd37a7SXin LI 	    while ((c = *p++) != '\n')
25318fd37a7SXin LI 	      if (! isspace (c))
25418fd37a7SXin LI 		h = HASH (h, tolower (c));
25518fd37a7SXin LI 	    break;
25618fd37a7SXin LI 
25718fd37a7SXin LI 	  case IGNORE_SPACE_CHANGE:
25818fd37a7SXin LI 	    while ((c = *p++) != '\n')
25918fd37a7SXin LI 	      {
26018fd37a7SXin LI 		if (isspace (c))
26118fd37a7SXin LI 		  {
26218fd37a7SXin LI 		    do
26318fd37a7SXin LI 		      if ((c = *p++) == '\n')
26418fd37a7SXin LI 			goto hashing_done;
26518fd37a7SXin LI 		    while (isspace (c));
26618fd37a7SXin LI 
26718fd37a7SXin LI 		    h = HASH (h, ' ');
26818fd37a7SXin LI 		  }
26918fd37a7SXin LI 
27018fd37a7SXin LI 		/* C is now the first non-space.  */
27118fd37a7SXin LI 		h = HASH (h, tolower (c));
27218fd37a7SXin LI 	      }
27318fd37a7SXin LI 	    break;
27418fd37a7SXin LI 
27518fd37a7SXin LI 	  case IGNORE_TAB_EXPANSION:
27618fd37a7SXin LI 	    {
27718fd37a7SXin LI 	      size_t column = 0;
27818fd37a7SXin LI 	      while ((c = *p++) != '\n')
27918fd37a7SXin LI 		{
28018fd37a7SXin LI 		  size_t repetitions = 1;
28118fd37a7SXin LI 
28218fd37a7SXin LI 		  switch (c)
28318fd37a7SXin LI 		    {
28418fd37a7SXin LI 		    case '\b':
28518fd37a7SXin LI 		      column -= 0 < column;
28618fd37a7SXin LI 		      break;
28718fd37a7SXin LI 
28818fd37a7SXin LI 		    case '\t':
28918fd37a7SXin LI 		      c = ' ';
29018fd37a7SXin LI 		      repetitions = tabsize - column % tabsize;
29118fd37a7SXin LI 		      column = (column + repetitions < column
29218fd37a7SXin LI 				? 0
29318fd37a7SXin LI 				: column + repetitions);
29418fd37a7SXin LI 		      break;
29518fd37a7SXin LI 
29618fd37a7SXin LI 		    case '\r':
29718fd37a7SXin LI 		      column = 0;
29818fd37a7SXin LI 		      break;
29918fd37a7SXin LI 
30018fd37a7SXin LI 		    default:
30118fd37a7SXin LI 		      c = tolower (c);
30218fd37a7SXin LI 		      column++;
30318fd37a7SXin LI 		      break;
30418fd37a7SXin LI 		    }
30518fd37a7SXin LI 
30618fd37a7SXin LI 		  do
30718fd37a7SXin LI 		    h = HASH (h, c);
30818fd37a7SXin LI 		  while (--repetitions != 0);
30918fd37a7SXin LI 		}
31018fd37a7SXin LI 	    }
31118fd37a7SXin LI 	    break;
31218fd37a7SXin LI 
31318fd37a7SXin LI 	  default:
31418fd37a7SXin LI 	    while ((c = *p++) != '\n')
31518fd37a7SXin LI 	      h = HASH (h, tolower (c));
31618fd37a7SXin LI 	    break;
31718fd37a7SXin LI 	  }
31818fd37a7SXin LI       else
31918fd37a7SXin LI 	switch (ignore_white_space)
32018fd37a7SXin LI 	  {
32118fd37a7SXin LI 	  case IGNORE_ALL_SPACE:
32218fd37a7SXin LI 	    while ((c = *p++) != '\n')
32318fd37a7SXin LI 	      if (! isspace (c))
32418fd37a7SXin LI 		h = HASH (h, c);
32518fd37a7SXin LI 	    break;
32618fd37a7SXin LI 
32718fd37a7SXin LI 	  case IGNORE_SPACE_CHANGE:
32818fd37a7SXin LI 	    while ((c = *p++) != '\n')
32918fd37a7SXin LI 	      {
33018fd37a7SXin LI 		if (isspace (c))
33118fd37a7SXin LI 		  {
33218fd37a7SXin LI 		    do
33318fd37a7SXin LI 		      if ((c = *p++) == '\n')
33418fd37a7SXin LI 			goto hashing_done;
33518fd37a7SXin LI 		    while (isspace (c));
33618fd37a7SXin LI 
33718fd37a7SXin LI 		    h = HASH (h, ' ');
33818fd37a7SXin LI 		  }
33918fd37a7SXin LI 
34018fd37a7SXin LI 		/* C is now the first non-space.  */
34118fd37a7SXin LI 		h = HASH (h, c);
34218fd37a7SXin LI 	      }
34318fd37a7SXin LI 	    break;
34418fd37a7SXin LI 
34518fd37a7SXin LI 	  case IGNORE_TAB_EXPANSION:
34618fd37a7SXin LI 	    {
34718fd37a7SXin LI 	      size_t column = 0;
34818fd37a7SXin LI 	      while ((c = *p++) != '\n')
34918fd37a7SXin LI 		{
35018fd37a7SXin LI 		  size_t repetitions = 1;
35118fd37a7SXin LI 
35218fd37a7SXin LI 		  switch (c)
35318fd37a7SXin LI 		    {
35418fd37a7SXin LI 		    case '\b':
35518fd37a7SXin LI 		      column -= 0 < column;
35618fd37a7SXin LI 		      break;
35718fd37a7SXin LI 
35818fd37a7SXin LI 		    case '\t':
35918fd37a7SXin LI 		      c = ' ';
36018fd37a7SXin LI 		      repetitions = tabsize - column % tabsize;
36118fd37a7SXin LI 		      column = (column + repetitions < column
36218fd37a7SXin LI 				? 0
36318fd37a7SXin LI 				: column + repetitions);
36418fd37a7SXin LI 		      break;
36518fd37a7SXin LI 
36618fd37a7SXin LI 		    case '\r':
36718fd37a7SXin LI 		      column = 0;
36818fd37a7SXin LI 		      break;
36918fd37a7SXin LI 
37018fd37a7SXin LI 		    default:
37118fd37a7SXin LI 		      column++;
37218fd37a7SXin LI 		      break;
37318fd37a7SXin LI 		    }
37418fd37a7SXin LI 
37518fd37a7SXin LI 		  do
37618fd37a7SXin LI 		    h = HASH (h, c);
37718fd37a7SXin LI 		  while (--repetitions != 0);
37818fd37a7SXin LI 		}
37918fd37a7SXin LI 	    }
38018fd37a7SXin LI 	    break;
38118fd37a7SXin LI 
38218fd37a7SXin LI 	  default:
38318fd37a7SXin LI 	    while ((c = *p++) != '\n')
38418fd37a7SXin LI 	      h = HASH (h, c);
38518fd37a7SXin LI 	    break;
38618fd37a7SXin LI 	  }
38718fd37a7SXin LI 
38818fd37a7SXin LI    hashing_done:;
38918fd37a7SXin LI 
39018fd37a7SXin LI       bucket = &buckets[h % nbuckets];
39118fd37a7SXin LI       length = p - ip - 1;
39218fd37a7SXin LI 
39318fd37a7SXin LI       if (p == bufend
39418fd37a7SXin LI 	  && current->missing_newline
39518fd37a7SXin LI 	  && ROBUST_OUTPUT_STYLE (output_style))
39618fd37a7SXin LI 	{
39718fd37a7SXin LI 	  /* This line is incomplete.  If this is significant,
39818fd37a7SXin LI 	     put the line into buckets[-1].  */
39918fd37a7SXin LI 	  if (ignore_white_space < IGNORE_SPACE_CHANGE)
40018fd37a7SXin LI 	    bucket = &buckets[-1];
40118fd37a7SXin LI 
40218fd37a7SXin LI 	  /* Omit the inserted newline when computing linbuf later.  */
40318fd37a7SXin LI 	  p--;
40418fd37a7SXin LI 	  bufend = suffix_begin = p;
40518fd37a7SXin LI 	}
40618fd37a7SXin LI 
40718fd37a7SXin LI       for (i = *bucket;  ;  i = eqs[i].next)
40818fd37a7SXin LI 	if (!i)
40918fd37a7SXin LI 	  {
41018fd37a7SXin LI 	    /* Create a new equivalence class in this bucket.  */
41118fd37a7SXin LI 	    i = eqs_index++;
41218fd37a7SXin LI 	    if (i == eqs_alloc)
41318fd37a7SXin LI 	      {
41418fd37a7SXin LI 		if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
41518fd37a7SXin LI 		  xalloc_die ();
41618fd37a7SXin LI 		eqs_alloc *= 2;
41718fd37a7SXin LI 		eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
41818fd37a7SXin LI 	      }
41918fd37a7SXin LI 	    eqs[i].next = *bucket;
42018fd37a7SXin LI 	    eqs[i].hash = h;
42118fd37a7SXin LI 	    eqs[i].line = ip;
42218fd37a7SXin LI 	    eqs[i].length = length;
42318fd37a7SXin LI 	    *bucket = i;
42418fd37a7SXin LI 	    break;
42518fd37a7SXin LI 	  }
42618fd37a7SXin LI 	else if (eqs[i].hash == h)
42718fd37a7SXin LI 	  {
42818fd37a7SXin LI 	    char const *eqline = eqs[i].line;
42918fd37a7SXin LI 
43018fd37a7SXin LI 	    /* Reuse existing class if lines_differ reports the lines
43118fd37a7SXin LI                equal.  */
43218fd37a7SXin LI 	    if (eqs[i].length == length)
43318fd37a7SXin LI 	      {
43418fd37a7SXin LI 		/* Reuse existing equivalence class if the lines are identical.
43518fd37a7SXin LI 		   This detects the common case of exact identity
43618fd37a7SXin LI 		   faster than lines_differ would.  */
43718fd37a7SXin LI 		if (memcmp (eqline, ip, length) == 0)
43818fd37a7SXin LI 		  break;
43918fd37a7SXin LI 		if (!same_length_diff_contents_compare_anyway)
44018fd37a7SXin LI 		  continue;
44118fd37a7SXin LI 	      }
44218fd37a7SXin LI 	    else if (!diff_length_compare_anyway)
44318fd37a7SXin LI 	      continue;
44418fd37a7SXin LI 
44518fd37a7SXin LI 	    if (! lines_differ (eqline, ip))
44618fd37a7SXin LI 	      break;
44718fd37a7SXin LI 	  }
44818fd37a7SXin LI 
44918fd37a7SXin LI       /* Maybe increase the size of the line table.  */
45018fd37a7SXin LI       if (line == alloc_lines)
45118fd37a7SXin LI 	{
45218fd37a7SXin LI 	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
45318fd37a7SXin LI 	  if (PTRDIFF_MAX / 3 <= alloc_lines
45418fd37a7SXin LI 	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
45518fd37a7SXin LI 	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
45618fd37a7SXin LI 	    xalloc_die ();
45718fd37a7SXin LI 	  alloc_lines = 2 * alloc_lines - linbuf_base;
45818fd37a7SXin LI 	  cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
45918fd37a7SXin LI 	  linbuf += linbuf_base;
46018fd37a7SXin LI 	  linbuf = xrealloc (linbuf,
46118fd37a7SXin LI 			     (alloc_lines - linbuf_base) * sizeof *linbuf);
46218fd37a7SXin LI 	  linbuf -= linbuf_base;
46318fd37a7SXin LI 	}
46418fd37a7SXin LI       linbuf[line] = ip;
46518fd37a7SXin LI       cureqs[line] = i;
46618fd37a7SXin LI       ++line;
46718fd37a7SXin LI     }
46818fd37a7SXin LI 
46918fd37a7SXin LI   current->buffered_lines = line;
47018fd37a7SXin LI 
47118fd37a7SXin LI   for (i = 0;  ;  i++)
47218fd37a7SXin LI     {
47318fd37a7SXin LI       /* Record the line start for lines in the suffix that we care about.
47418fd37a7SXin LI 	 Record one more line start than lines,
47518fd37a7SXin LI 	 so that we can compute the length of any buffered line.  */
47618fd37a7SXin LI       if (line == alloc_lines)
47718fd37a7SXin LI 	{
47818fd37a7SXin LI 	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
47918fd37a7SXin LI 	  if (PTRDIFF_MAX / 3 <= alloc_lines
48018fd37a7SXin LI 	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
48118fd37a7SXin LI 	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
48218fd37a7SXin LI 	    xalloc_die ();
48318fd37a7SXin LI 	  alloc_lines = 2 * alloc_lines - linbuf_base;
48418fd37a7SXin LI 	  linbuf += linbuf_base;
48518fd37a7SXin LI 	  linbuf = xrealloc (linbuf,
48618fd37a7SXin LI 			     (alloc_lines - linbuf_base) * sizeof *linbuf);
48718fd37a7SXin LI 	  linbuf -= linbuf_base;
48818fd37a7SXin LI 	}
48918fd37a7SXin LI       linbuf[line] = p;
49018fd37a7SXin LI 
49118fd37a7SXin LI       if (p == bufend)
49218fd37a7SXin LI 	break;
49318fd37a7SXin LI 
49418fd37a7SXin LI       if (context <= i && no_diff_means_no_output)
49518fd37a7SXin LI 	break;
49618fd37a7SXin LI 
49718fd37a7SXin LI       line++;
49818fd37a7SXin LI 
49918fd37a7SXin LI       while (*p++ != '\n')
50018fd37a7SXin LI 	continue;
50118fd37a7SXin LI     }
50218fd37a7SXin LI 
50318fd37a7SXin LI   /* Done with cache in local variables.  */
50418fd37a7SXin LI   current->linbuf = linbuf;
50518fd37a7SXin LI   current->valid_lines = line;
50618fd37a7SXin LI   current->alloc_lines = alloc_lines;
50718fd37a7SXin LI   current->equivs = cureqs;
50818fd37a7SXin LI   equivs = eqs;
50918fd37a7SXin LI   equivs_alloc = eqs_alloc;
51018fd37a7SXin LI   equivs_index = eqs_index;
51118fd37a7SXin LI }
51218fd37a7SXin LI 
51318fd37a7SXin LI /* Prepare the text.  Make sure the text end is initialized.
51418fd37a7SXin LI    Make sure text ends in a newline,
51518fd37a7SXin LI    but remember that we had to add one.
51618fd37a7SXin LI    Strip trailing CRs, if that was requested.  */
51718fd37a7SXin LI 
51818fd37a7SXin LI static void
prepare_text(struct file_data * current)51918fd37a7SXin LI prepare_text (struct file_data *current)
52018fd37a7SXin LI {
52118fd37a7SXin LI   size_t buffered = current->buffered;
52218fd37a7SXin LI   char *p = FILE_BUFFER (current);
52318fd37a7SXin LI   char *dst;
52418fd37a7SXin LI 
52518fd37a7SXin LI   if (buffered == 0 || p[buffered - 1] == '\n')
52618fd37a7SXin LI     current->missing_newline = false;
52718fd37a7SXin LI   else
52818fd37a7SXin LI     {
52918fd37a7SXin LI       p[buffered++] = '\n';
53018fd37a7SXin LI       current->missing_newline = true;
53118fd37a7SXin LI     }
53218fd37a7SXin LI 
53318fd37a7SXin LI   if (!p)
53418fd37a7SXin LI     return;
53518fd37a7SXin LI 
53618fd37a7SXin LI   /* Don't use uninitialized storage when planting or using sentinels.  */
53718fd37a7SXin LI   memset (p + buffered, 0, sizeof (word));
53818fd37a7SXin LI 
53918fd37a7SXin LI   if (strip_trailing_cr && (dst = memchr (p, '\r', buffered)))
54018fd37a7SXin LI     {
54118fd37a7SXin LI       char const *src = dst;
54218fd37a7SXin LI       char const *srclim = p + buffered;
54318fd37a7SXin LI 
54418fd37a7SXin LI       do
54518fd37a7SXin LI 	dst += ! ((*dst = *src++) == '\r' && *src == '\n');
54618fd37a7SXin LI       while (src < srclim);
54718fd37a7SXin LI 
54818fd37a7SXin LI       buffered -= src - dst;
54918fd37a7SXin LI     }
55018fd37a7SXin LI 
55118fd37a7SXin LI   current->buffered = buffered;
55218fd37a7SXin LI }
55318fd37a7SXin LI 
55418fd37a7SXin LI /* We have found N lines in a buffer of size S; guess the
55518fd37a7SXin LI    proportionate number of lines that will be found in a buffer of
55618fd37a7SXin LI    size T.  However, do not guess a number of lines so large that the
55718fd37a7SXin LI    resulting line table might cause overflow in size calculations.  */
55818fd37a7SXin LI static lin
guess_lines(lin n,size_t s,size_t t)55918fd37a7SXin LI guess_lines (lin n, size_t s, size_t t)
56018fd37a7SXin LI {
56118fd37a7SXin LI   size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
56218fd37a7SXin LI   lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
56318fd37a7SXin LI   return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
56418fd37a7SXin LI }
56518fd37a7SXin LI 
56618fd37a7SXin LI /* Given a vector of two file_data objects, find the identical
56718fd37a7SXin LI    prefixes and suffixes of each object.  */
56818fd37a7SXin LI 
56918fd37a7SXin LI static void
find_identical_ends(struct file_data filevec[])57018fd37a7SXin LI find_identical_ends (struct file_data filevec[])
57118fd37a7SXin LI {
57218fd37a7SXin LI   word *w0, *w1;
57318fd37a7SXin LI   char *p0, *p1, *buffer0, *buffer1;
57418fd37a7SXin LI   char const *end0, *beg0;
57518fd37a7SXin LI   char const **linbuf0, **linbuf1;
57618fd37a7SXin LI   lin i, lines;
57718fd37a7SXin LI   size_t n0, n1;
57818fd37a7SXin LI   lin alloc_lines0, alloc_lines1;
57918fd37a7SXin LI   lin buffered_prefix, prefix_count, prefix_mask;
58018fd37a7SXin LI   lin middle_guess, suffix_guess;
58118fd37a7SXin LI 
58218fd37a7SXin LI   slurp (&filevec[0]);
58318fd37a7SXin LI   prepare_text (&filevec[0]);
58418fd37a7SXin LI   if (filevec[0].desc != filevec[1].desc)
58518fd37a7SXin LI     {
58618fd37a7SXin LI       slurp (&filevec[1]);
58718fd37a7SXin LI       prepare_text (&filevec[1]);
58818fd37a7SXin LI     }
58918fd37a7SXin LI   else
59018fd37a7SXin LI     {
59118fd37a7SXin LI       filevec[1].buffer = filevec[0].buffer;
59218fd37a7SXin LI       filevec[1].bufsize = filevec[0].bufsize;
59318fd37a7SXin LI       filevec[1].buffered = filevec[0].buffered;
59418fd37a7SXin LI       filevec[1].missing_newline = filevec[0].missing_newline;
59518fd37a7SXin LI     }
59618fd37a7SXin LI 
59718fd37a7SXin LI   /* Find identical prefix.  */
59818fd37a7SXin LI 
59918fd37a7SXin LI   w0 = filevec[0].buffer;
60018fd37a7SXin LI   w1 = filevec[1].buffer;
60118fd37a7SXin LI   p0 = buffer0 = (char *) w0;
60218fd37a7SXin LI   p1 = buffer1 = (char *) w1;
60318fd37a7SXin LI   n0 = filevec[0].buffered;
60418fd37a7SXin LI   n1 = filevec[1].buffered;
60518fd37a7SXin LI 
60618fd37a7SXin LI   if (p0 == p1)
60718fd37a7SXin LI     /* The buffers are the same; sentinels won't work.  */
60818fd37a7SXin LI     p0 = p1 += n1;
60918fd37a7SXin LI   else
61018fd37a7SXin LI     {
61118fd37a7SXin LI       /* Insert end sentinels, in this case characters that are guaranteed
61218fd37a7SXin LI 	 to make the equality test false, and thus terminate the loop.  */
61318fd37a7SXin LI 
61418fd37a7SXin LI       if (n0 < n1)
61518fd37a7SXin LI 	p0[n0] = ~p1[n0];
61618fd37a7SXin LI       else
61718fd37a7SXin LI 	p1[n1] = ~p0[n1];
61818fd37a7SXin LI 
61918fd37a7SXin LI       /* Loop until first mismatch, or to the sentinel characters.  */
62018fd37a7SXin LI 
62118fd37a7SXin LI       /* Compare a word at a time for speed.  */
62218fd37a7SXin LI       while (*w0 == *w1)
62318fd37a7SXin LI 	w0++, w1++;
62418fd37a7SXin LI 
62518fd37a7SXin LI       /* Do the last few bytes of comparison a byte at a time.  */
62618fd37a7SXin LI       p0 = (char *) w0;
62718fd37a7SXin LI       p1 = (char *) w1;
62818fd37a7SXin LI       while (*p0 == *p1)
62918fd37a7SXin LI 	p0++, p1++;
63018fd37a7SXin LI 
63118fd37a7SXin LI       /* Don't mistakenly count missing newline as part of prefix.  */
63218fd37a7SXin LI       if (ROBUST_OUTPUT_STYLE (output_style)
63318fd37a7SXin LI 	  && ((buffer0 + n0 - filevec[0].missing_newline < p0)
63418fd37a7SXin LI 	      !=
63518fd37a7SXin LI 	      (buffer1 + n1 - filevec[1].missing_newline < p1)))
63618fd37a7SXin LI 	p0--, p1--;
63718fd37a7SXin LI     }
63818fd37a7SXin LI 
63918fd37a7SXin LI   /* Now P0 and P1 point at the first nonmatching characters.  */
64018fd37a7SXin LI 
64118fd37a7SXin LI   /* Skip back to last line-beginning in the prefix,
64218fd37a7SXin LI      and then discard up to HORIZON_LINES lines from the prefix.  */
64318fd37a7SXin LI   i = horizon_lines;
64418fd37a7SXin LI   while (p0 != buffer0 && (p0[-1] != '\n' || i--))
64518fd37a7SXin LI     p0--, p1--;
64618fd37a7SXin LI 
64718fd37a7SXin LI   /* Record the prefix.  */
64818fd37a7SXin LI   filevec[0].prefix_end = p0;
64918fd37a7SXin LI   filevec[1].prefix_end = p1;
65018fd37a7SXin LI 
65118fd37a7SXin LI   /* Find identical suffix.  */
65218fd37a7SXin LI 
65318fd37a7SXin LI   /* P0 and P1 point beyond the last chars not yet compared.  */
65418fd37a7SXin LI   p0 = buffer0 + n0;
65518fd37a7SXin LI   p1 = buffer1 + n1;
65618fd37a7SXin LI 
65718fd37a7SXin LI   if (! ROBUST_OUTPUT_STYLE (output_style)
65818fd37a7SXin LI       || filevec[0].missing_newline == filevec[1].missing_newline)
65918fd37a7SXin LI     {
66018fd37a7SXin LI       end0 = p0;	/* Addr of last char in file 0.  */
66118fd37a7SXin LI 
66218fd37a7SXin LI       /* Get value of P0 at which we should stop scanning backward:
66318fd37a7SXin LI 	 this is when either P0 or P1 points just past the last char
66418fd37a7SXin LI 	 of the identical prefix.  */
66518fd37a7SXin LI       beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
66618fd37a7SXin LI 
66718fd37a7SXin LI       /* Scan back until chars don't match or we reach that point.  */
66818fd37a7SXin LI       for (; p0 != beg0; p0--, p1--)
66918fd37a7SXin LI 	if (*p0 != *p1)
67018fd37a7SXin LI 	  {
67118fd37a7SXin LI 	    /* Point at the first char of the matching suffix.  */
67218fd37a7SXin LI 	    beg0 = p0;
67318fd37a7SXin LI 	    break;
67418fd37a7SXin LI 	  }
67518fd37a7SXin LI 
67618fd37a7SXin LI       /* Are we at a line-beginning in both files?  If not, add the rest of
67718fd37a7SXin LI 	 this line to the main body.  Discard up to HORIZON_LINES lines from
67818fd37a7SXin LI 	 the identical suffix.  Also, discard one extra line,
67918fd37a7SXin LI 	 because shift_boundaries may need it.  */
68018fd37a7SXin LI       i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
68118fd37a7SXin LI 			    &&
68218fd37a7SXin LI 			    (buffer1 == p1 || p1[-1] == '\n'));
68318fd37a7SXin LI       while (i-- && p0 != end0)
68418fd37a7SXin LI 	while (*p0++ != '\n')
68518fd37a7SXin LI 	  continue;
68618fd37a7SXin LI 
68718fd37a7SXin LI       p1 += p0 - beg0;
68818fd37a7SXin LI     }
68918fd37a7SXin LI 
69018fd37a7SXin LI   /* Record the suffix.  */
69118fd37a7SXin LI   filevec[0].suffix_begin = p0;
69218fd37a7SXin LI   filevec[1].suffix_begin = p1;
69318fd37a7SXin LI 
69418fd37a7SXin LI   /* Calculate number of lines of prefix to save.
69518fd37a7SXin LI 
69618fd37a7SXin LI      prefix_count == 0 means save the whole prefix;
69718fd37a7SXin LI      we need this for options like -D that output the whole file,
69818fd37a7SXin LI      or for enormous contexts (to avoid worrying about arithmetic overflow).
69918fd37a7SXin LI      We also need it for options like -F that output some preceding line;
70018fd37a7SXin LI      at least we will need to find the last few lines,
70118fd37a7SXin LI      but since we don't know how many, it's easiest to find them all.
70218fd37a7SXin LI 
70318fd37a7SXin LI      Otherwise, prefix_count != 0.  Save just prefix_count lines at start
70418fd37a7SXin LI      of the line buffer; they'll be moved to the proper location later.
70518fd37a7SXin LI      Handle 1 more line than the context says (because we count 1 too many),
70618fd37a7SXin LI      rounded up to the next power of 2 to speed index computation.  */
70718fd37a7SXin LI 
70818fd37a7SXin LI   if (no_diff_means_no_output && ! function_regexp.fastmap
70918fd37a7SXin LI       && context < LIN_MAX / 4 && context < n0)
71018fd37a7SXin LI     {
71118fd37a7SXin LI       middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
71218fd37a7SXin LI       suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
71318fd37a7SXin LI       for (prefix_count = 1;  prefix_count <= context;  prefix_count *= 2)
71418fd37a7SXin LI 	continue;
71518fd37a7SXin LI       alloc_lines0 = (prefix_count + middle_guess
71618fd37a7SXin LI 		      + MIN (context, suffix_guess));
71718fd37a7SXin LI     }
71818fd37a7SXin LI   else
71918fd37a7SXin LI     {
72018fd37a7SXin LI       prefix_count = 0;
72118fd37a7SXin LI       alloc_lines0 = guess_lines (0, 0, n0);
72218fd37a7SXin LI     }
72318fd37a7SXin LI 
72418fd37a7SXin LI   prefix_mask = prefix_count - 1;
72518fd37a7SXin LI   lines = 0;
72618fd37a7SXin LI   linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
72718fd37a7SXin LI   p0 = buffer0;
72818fd37a7SXin LI 
72918fd37a7SXin LI   /* If the prefix is needed, find the prefix lines.  */
73018fd37a7SXin LI   if (! (no_diff_means_no_output
73118fd37a7SXin LI 	 && filevec[0].prefix_end == p0
73218fd37a7SXin LI 	 && filevec[1].prefix_end == p1))
73318fd37a7SXin LI     {
73418fd37a7SXin LI       end0 = filevec[0].prefix_end;
73518fd37a7SXin LI       while (p0 != end0)
73618fd37a7SXin LI 	{
73718fd37a7SXin LI 	  lin l = lines++ & prefix_mask;
73818fd37a7SXin LI 	  if (l == alloc_lines0)
73918fd37a7SXin LI 	    {
74018fd37a7SXin LI 	      if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
74118fd37a7SXin LI 		xalloc_die ();
74218fd37a7SXin LI 	      alloc_lines0 *= 2;
74318fd37a7SXin LI 	      linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
74418fd37a7SXin LI 	    }
74518fd37a7SXin LI 	  linbuf0[l] = p0;
74618fd37a7SXin LI 	  while (*p0++ != '\n')
74718fd37a7SXin LI 	    continue;
74818fd37a7SXin LI 	}
74918fd37a7SXin LI     }
75018fd37a7SXin LI   buffered_prefix = prefix_count && context < lines ? context : lines;
75118fd37a7SXin LI 
75218fd37a7SXin LI   /* Allocate line buffer 1.  */
75318fd37a7SXin LI 
75418fd37a7SXin LI   middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
75518fd37a7SXin LI   suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
75618fd37a7SXin LI   alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
75718fd37a7SXin LI   if (alloc_lines1 < buffered_prefix
75818fd37a7SXin LI       || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
75918fd37a7SXin LI     xalloc_die ();
76018fd37a7SXin LI   linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
76118fd37a7SXin LI 
76218fd37a7SXin LI   if (buffered_prefix != lines)
76318fd37a7SXin LI     {
76418fd37a7SXin LI       /* Rotate prefix lines to proper location.  */
76518fd37a7SXin LI       for (i = 0;  i < buffered_prefix;  i++)
76618fd37a7SXin LI 	linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
76718fd37a7SXin LI       for (i = 0;  i < buffered_prefix;  i++)
76818fd37a7SXin LI 	linbuf0[i] = linbuf1[i];
76918fd37a7SXin LI     }
77018fd37a7SXin LI 
77118fd37a7SXin LI   /* Initialize line buffer 1 from line buffer 0.  */
77218fd37a7SXin LI   for (i = 0; i < buffered_prefix; i++)
77318fd37a7SXin LI     linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
77418fd37a7SXin LI 
77518fd37a7SXin LI   /* Record the line buffer, adjusted so that
77618fd37a7SXin LI      linbuf[0] points at the first differing line.  */
77718fd37a7SXin LI   filevec[0].linbuf = linbuf0 + buffered_prefix;
77818fd37a7SXin LI   filevec[1].linbuf = linbuf1 + buffered_prefix;
77918fd37a7SXin LI   filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
78018fd37a7SXin LI   filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
78118fd37a7SXin LI   filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
78218fd37a7SXin LI   filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
78318fd37a7SXin LI }
78418fd37a7SXin LI 
78518fd37a7SXin LI /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
78618fd37a7SXin LI    than 2**k.  This table is derived from Chris K. Caldwell's list
78718fd37a7SXin LI    <http://www.utm.edu/research/primes/lists/2small/>.  */
78818fd37a7SXin LI 
78918fd37a7SXin LI static unsigned char const prime_offset[] =
79018fd37a7SXin LI {
79118fd37a7SXin LI   0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
79218fd37a7SXin LI   15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
79318fd37a7SXin LI   11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
79418fd37a7SXin LI   55, 93, 1, 57, 25
79518fd37a7SXin LI };
79618fd37a7SXin LI 
79718fd37a7SXin LI /* Verify that this host's size_t is not too wide for the above table.  */
79818fd37a7SXin LI 
79918fd37a7SXin LI verify (enough_prime_offsets,
80018fd37a7SXin LI 	sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
80118fd37a7SXin LI 
80218fd37a7SXin LI /* Given a vector of two file_data objects, read the file associated
80318fd37a7SXin LI    with each one, and build the table of equivalence classes.
80418fd37a7SXin LI    Return nonzero if either file appears to be a binary file.
80518fd37a7SXin LI    If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
80618fd37a7SXin LI 
80718fd37a7SXin LI bool
read_files(struct file_data filevec[],bool pretend_binary)80818fd37a7SXin LI read_files (struct file_data filevec[], bool pretend_binary)
80918fd37a7SXin LI {
81018fd37a7SXin LI   int i;
81118fd37a7SXin LI   bool skip_test = text | pretend_binary;
81218fd37a7SXin LI   bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
81318fd37a7SXin LI 
81418fd37a7SXin LI   if (filevec[0].desc != filevec[1].desc)
81518fd37a7SXin LI     appears_binary |= sip (&filevec[1], skip_test | appears_binary);
81618fd37a7SXin LI   else
81718fd37a7SXin LI     {
81818fd37a7SXin LI       filevec[1].buffer = filevec[0].buffer;
81918fd37a7SXin LI       filevec[1].bufsize = filevec[0].bufsize;
82018fd37a7SXin LI       filevec[1].buffered = filevec[0].buffered;
82118fd37a7SXin LI     }
82218fd37a7SXin LI   if (appears_binary)
82318fd37a7SXin LI     {
82418fd37a7SXin LI       set_binary_mode (filevec[0].desc, true);
82518fd37a7SXin LI       set_binary_mode (filevec[1].desc, true);
82618fd37a7SXin LI       return true;
82718fd37a7SXin LI     }
82818fd37a7SXin LI 
82918fd37a7SXin LI   find_identical_ends (filevec);
83018fd37a7SXin LI 
83118fd37a7SXin LI   equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
83218fd37a7SXin LI   if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
83318fd37a7SXin LI     xalloc_die ();
83418fd37a7SXin LI   equivs = xmalloc (equivs_alloc * sizeof *equivs);
83518fd37a7SXin LI   /* Equivalence class 0 is permanently safe for lines that were not
83618fd37a7SXin LI      hashed.  Real equivalence classes start at 1.  */
83718fd37a7SXin LI   equivs_index = 1;
83818fd37a7SXin LI 
83918fd37a7SXin LI   /* Allocate (one plus) a prime number of hash buckets.  Use a prime
84018fd37a7SXin LI      number between 1/3 and 2/3 of the value of equiv_allocs,
84118fd37a7SXin LI      approximately.  */
84218fd37a7SXin LI   for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
84318fd37a7SXin LI     continue;
84418fd37a7SXin LI   nbuckets = ((size_t) 1 << i) - prime_offset[i];
84518fd37a7SXin LI   if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
84618fd37a7SXin LI     xalloc_die ();
84718fd37a7SXin LI   buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
84818fd37a7SXin LI   buckets++;
84918fd37a7SXin LI 
85018fd37a7SXin LI   for (i = 0; i < 2; i++)
85118fd37a7SXin LI     find_and_hash_each_line (&filevec[i]);
85218fd37a7SXin LI 
85318fd37a7SXin LI   filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
85418fd37a7SXin LI 
85518fd37a7SXin LI   free (equivs);
85618fd37a7SXin LI   free (buckets - 1);
85718fd37a7SXin LI 
85818fd37a7SXin LI   return false;
85918fd37a7SXin LI }
860