1 /* File I/O for GNU DIFF.
2
3 Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002,
4 2004 Free Software Foundation, Inc.
5
6 This file is part of GNU DIFF.
7
8 GNU DIFF is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 GNU DIFF is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING.
20 If not, write to the Free Software Foundation,
21 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
22
23 #include "diff.h"
24 #include <cmpbuf.h>
25 #include <file-type.h>
26 #include <setmode.h>
27 #include <xalloc.h>
28
29 /* Rotate an unsigned value to the left. */
30 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
31
32 /* Given a hash value and a new character, return a new hash value. */
33 #define HASH(h, c) ((c) + ROL (h, 7))
34
35 /* The type of a hash value. */
36 typedef size_t hash_value;
37 verify (hash_value_is_unsigned, ! TYPE_SIGNED (hash_value));
38
39 /* Lines are put into equivalence classes of lines that match in lines_differ.
40 Each equivalence class is represented by one of these structures,
41 but only while the classes are being computed.
42 Afterward, each class is represented by a number. */
43 struct equivclass
44 {
45 lin next; /* Next item in this bucket. */
46 hash_value hash; /* Hash of lines in this class. */
47 char const *line; /* A line that fits this class. */
48 size_t length; /* That line's length, not counting its newline. */
49 };
50
51 /* Hash-table: array of buckets, each being a chain of equivalence classes.
52 buckets[-1] is reserved for incomplete lines. */
53 static lin *buckets;
54
55 /* Number of buckets in the hash table array, not counting buckets[-1]. */
56 static size_t nbuckets;
57
58 /* Array in which the equivalence classes are allocated.
59 The bucket-chains go through the elements in this array.
60 The number of an equivalence class is its index in this array. */
61 static struct equivclass *equivs;
62
63 /* Index of first free element in the array `equivs'. */
64 static lin equivs_index;
65
66 /* Number of elements allocated in the array `equivs'. */
67 static lin equivs_alloc;
68
69 /* Read a block of data into a file buffer, checking for EOF and error. */
70
71 void
file_block_read(struct file_data * current,size_t size)72 file_block_read (struct file_data *current, size_t size)
73 {
74 if (size && ! current->eof)
75 {
76 size_t s = block_read (current->desc,
77 FILE_BUFFER (current) + current->buffered, size);
78 if (s == SIZE_MAX)
79 pfatal_with_name (current->name);
80 current->buffered += s;
81 current->eof = s < size;
82 }
83 }
84
85 /* Check for binary files and compare them for exact identity. */
86
87 /* Return 1 if BUF contains a non text character.
88 SIZE is the number of characters in BUF. */
89
90 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
91
92 /* Get ready to read the current file.
93 Return nonzero if SKIP_TEST is zero,
94 and if it appears to be a binary file. */
95
96 static bool
sip(struct file_data * current,bool skip_test)97 sip (struct file_data *current, bool skip_test)
98 {
99 /* If we have a nonexistent file at this stage, treat it as empty. */
100 if (current->desc < 0)
101 {
102 /* Leave room for a sentinel. */
103 current->bufsize = sizeof (word);
104 current->buffer = xmalloc (current->bufsize);
105 }
106 else
107 {
108 current->bufsize = buffer_lcm (sizeof (word),
109 STAT_BLOCKSIZE (current->stat),
110 PTRDIFF_MAX - 2 * sizeof (word));
111 current->buffer = xmalloc (current->bufsize);
112
113 if (! skip_test)
114 {
115 /* Check first part of file to see if it's a binary file. */
116
117 bool was_binary = set_binary_mode (current->desc, true);
118 off_t buffered;
119 file_block_read (current, current->bufsize);
120 buffered = current->buffered;
121
122 if (! was_binary)
123 {
124 /* Revert to text mode and seek back to the beginning to
125 reread the file. Use relative seek, since file
126 descriptors like stdin might not start at offset
127 zero. */
128
129 if (lseek (current->desc, - buffered, SEEK_CUR) == -1)
130 pfatal_with_name (current->name);
131 set_binary_mode (current->desc, false);
132 current->buffered = 0;
133 current->eof = false;
134 }
135
136 return binary_file_p (current->buffer, buffered);
137 }
138 }
139
140 current->buffered = 0;
141 current->eof = false;
142 return false;
143 }
144
145 /* Slurp the rest of the current file completely into memory. */
146
147 static void
slurp(struct file_data * current)148 slurp (struct file_data *current)
149 {
150 size_t cc;
151
152 if (current->desc < 0)
153 {
154 /* The file is nonexistent. */
155 return;
156 }
157
158 if (S_ISREG (current->stat.st_mode))
159 {
160 /* It's a regular file; slurp in the rest all at once. */
161
162 /* Get the size out of the stat block.
163 Allocate just enough room for appended newline plus word sentinel,
164 plus word-alignment since we want the buffer word-aligned. */
165 size_t file_size = current->stat.st_size;
166 cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
167 if (file_size != current->stat.st_size || cc < file_size
168 || PTRDIFF_MAX <= cc)
169 xalloc_die ();
170
171 if (current->bufsize < cc)
172 {
173 current->bufsize = cc;
174 current->buffer = xrealloc (current->buffer, cc);
175 }
176
177 /* Try to read at least 1 more byte than the size indicates, to
178 detect whether the file is growing. This is a nicety for
179 users who run 'diff' on files while they are changing. */
180
181 if (current->buffered <= file_size)
182 {
183 file_block_read (current, file_size + 1 - current->buffered);
184 if (current->buffered <= file_size)
185 return;
186 }
187 }
188
189 /* It's not a regular file, or it's a growing regular file; read it,
190 growing the buffer as needed. */
191
192 file_block_read (current, current->bufsize - current->buffered);
193
194 if (current->buffered)
195 {
196 while (current->buffered == current->bufsize)
197 {
198 if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
199 xalloc_die ();
200 current->bufsize *= 2;
201 current->buffer = xrealloc (current->buffer, current->bufsize);
202 file_block_read (current, current->bufsize - current->buffered);
203 }
204
205 /* Allocate just enough room for appended newline plus word
206 sentinel, plus word-alignment. */
207 cc = current->buffered + 2 * sizeof (word);
208 current->bufsize = cc - cc % sizeof (word);
209 current->buffer = xrealloc (current->buffer, current->bufsize);
210 }
211 }
212
213 /* Split the file into lines, simultaneously computing the equivalence
214 class for each line. */
215
216 static void
find_and_hash_each_line(struct file_data * current)217 find_and_hash_each_line (struct file_data *current)
218 {
219 hash_value h;
220 char const *p = current->prefix_end;
221 unsigned char c;
222 lin i, *bucket;
223 size_t length;
224
225 /* Cache often-used quantities in local variables to help the compiler. */
226 char const **linbuf = current->linbuf;
227 lin alloc_lines = current->alloc_lines;
228 lin line = 0;
229 lin linbuf_base = current->linbuf_base;
230 lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
231 struct equivclass *eqs = equivs;
232 lin eqs_index = equivs_index;
233 lin eqs_alloc = equivs_alloc;
234 char const *suffix_begin = current->suffix_begin;
235 char const *bufend = FILE_BUFFER (current) + current->buffered;
236 bool diff_length_compare_anyway =
237 ignore_white_space != IGNORE_NO_WHITE_SPACE;
238 bool same_length_diff_contents_compare_anyway =
239 diff_length_compare_anyway | ignore_case;
240
241 while (p < suffix_begin)
242 {
243 char const *ip = p;
244
245 h = 0;
246
247 /* Hash this line until we find a newline. */
248 if (ignore_case)
249 switch (ignore_white_space)
250 {
251 case IGNORE_ALL_SPACE:
252 while ((c = *p++) != '\n')
253 if (! isspace (c))
254 h = HASH (h, tolower (c));
255 break;
256
257 case IGNORE_SPACE_CHANGE:
258 while ((c = *p++) != '\n')
259 {
260 if (isspace (c))
261 {
262 do
263 if ((c = *p++) == '\n')
264 goto hashing_done;
265 while (isspace (c));
266
267 h = HASH (h, ' ');
268 }
269
270 /* C is now the first non-space. */
271 h = HASH (h, tolower (c));
272 }
273 break;
274
275 case IGNORE_TAB_EXPANSION:
276 {
277 size_t column = 0;
278 while ((c = *p++) != '\n')
279 {
280 size_t repetitions = 1;
281
282 switch (c)
283 {
284 case '\b':
285 column -= 0 < column;
286 break;
287
288 case '\t':
289 c = ' ';
290 repetitions = tabsize - column % tabsize;
291 column = (column + repetitions < column
292 ? 0
293 : column + repetitions);
294 break;
295
296 case '\r':
297 column = 0;
298 break;
299
300 default:
301 c = tolower (c);
302 column++;
303 break;
304 }
305
306 do
307 h = HASH (h, c);
308 while (--repetitions != 0);
309 }
310 }
311 break;
312
313 default:
314 while ((c = *p++) != '\n')
315 h = HASH (h, tolower (c));
316 break;
317 }
318 else
319 switch (ignore_white_space)
320 {
321 case IGNORE_ALL_SPACE:
322 while ((c = *p++) != '\n')
323 if (! isspace (c))
324 h = HASH (h, c);
325 break;
326
327 case IGNORE_SPACE_CHANGE:
328 while ((c = *p++) != '\n')
329 {
330 if (isspace (c))
331 {
332 do
333 if ((c = *p++) == '\n')
334 goto hashing_done;
335 while (isspace (c));
336
337 h = HASH (h, ' ');
338 }
339
340 /* C is now the first non-space. */
341 h = HASH (h, c);
342 }
343 break;
344
345 case IGNORE_TAB_EXPANSION:
346 {
347 size_t column = 0;
348 while ((c = *p++) != '\n')
349 {
350 size_t repetitions = 1;
351
352 switch (c)
353 {
354 case '\b':
355 column -= 0 < column;
356 break;
357
358 case '\t':
359 c = ' ';
360 repetitions = tabsize - column % tabsize;
361 column = (column + repetitions < column
362 ? 0
363 : column + repetitions);
364 break;
365
366 case '\r':
367 column = 0;
368 break;
369
370 default:
371 column++;
372 break;
373 }
374
375 do
376 h = HASH (h, c);
377 while (--repetitions != 0);
378 }
379 }
380 break;
381
382 default:
383 while ((c = *p++) != '\n')
384 h = HASH (h, c);
385 break;
386 }
387
388 hashing_done:;
389
390 bucket = &buckets[h % nbuckets];
391 length = p - ip - 1;
392
393 if (p == bufend
394 && current->missing_newline
395 && ROBUST_OUTPUT_STYLE (output_style))
396 {
397 /* This line is incomplete. If this is significant,
398 put the line into buckets[-1]. */
399 if (ignore_white_space < IGNORE_SPACE_CHANGE)
400 bucket = &buckets[-1];
401
402 /* Omit the inserted newline when computing linbuf later. */
403 p--;
404 bufend = suffix_begin = p;
405 }
406
407 for (i = *bucket; ; i = eqs[i].next)
408 if (!i)
409 {
410 /* Create a new equivalence class in this bucket. */
411 i = eqs_index++;
412 if (i == eqs_alloc)
413 {
414 if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
415 xalloc_die ();
416 eqs_alloc *= 2;
417 eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
418 }
419 eqs[i].next = *bucket;
420 eqs[i].hash = h;
421 eqs[i].line = ip;
422 eqs[i].length = length;
423 *bucket = i;
424 break;
425 }
426 else if (eqs[i].hash == h)
427 {
428 char const *eqline = eqs[i].line;
429
430 /* Reuse existing class if lines_differ reports the lines
431 equal. */
432 if (eqs[i].length == length)
433 {
434 /* Reuse existing equivalence class if the lines are identical.
435 This detects the common case of exact identity
436 faster than lines_differ would. */
437 if (memcmp (eqline, ip, length) == 0)
438 break;
439 if (!same_length_diff_contents_compare_anyway)
440 continue;
441 }
442 else if (!diff_length_compare_anyway)
443 continue;
444
445 if (! lines_differ (eqline, ip))
446 break;
447 }
448
449 /* Maybe increase the size of the line table. */
450 if (line == alloc_lines)
451 {
452 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
453 if (PTRDIFF_MAX / 3 <= alloc_lines
454 || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
455 || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
456 xalloc_die ();
457 alloc_lines = 2 * alloc_lines - linbuf_base;
458 cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
459 linbuf += linbuf_base;
460 linbuf = xrealloc (linbuf,
461 (alloc_lines - linbuf_base) * sizeof *linbuf);
462 linbuf -= linbuf_base;
463 }
464 linbuf[line] = ip;
465 cureqs[line] = i;
466 ++line;
467 }
468
469 current->buffered_lines = line;
470
471 for (i = 0; ; i++)
472 {
473 /* Record the line start for lines in the suffix that we care about.
474 Record one more line start than lines,
475 so that we can compute the length of any buffered line. */
476 if (line == alloc_lines)
477 {
478 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
479 if (PTRDIFF_MAX / 3 <= alloc_lines
480 || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
481 || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
482 xalloc_die ();
483 alloc_lines = 2 * alloc_lines - linbuf_base;
484 linbuf += linbuf_base;
485 linbuf = xrealloc (linbuf,
486 (alloc_lines - linbuf_base) * sizeof *linbuf);
487 linbuf -= linbuf_base;
488 }
489 linbuf[line] = p;
490
491 if (p == bufend)
492 break;
493
494 if (context <= i && no_diff_means_no_output)
495 break;
496
497 line++;
498
499 while (*p++ != '\n')
500 continue;
501 }
502
503 /* Done with cache in local variables. */
504 current->linbuf = linbuf;
505 current->valid_lines = line;
506 current->alloc_lines = alloc_lines;
507 current->equivs = cureqs;
508 equivs = eqs;
509 equivs_alloc = eqs_alloc;
510 equivs_index = eqs_index;
511 }
512
513 /* Prepare the text. Make sure the text end is initialized.
514 Make sure text ends in a newline,
515 but remember that we had to add one.
516 Strip trailing CRs, if that was requested. */
517
518 static void
prepare_text(struct file_data * current)519 prepare_text (struct file_data *current)
520 {
521 size_t buffered = current->buffered;
522 char *p = FILE_BUFFER (current);
523 char *dst;
524
525 if (buffered == 0 || p[buffered - 1] == '\n')
526 current->missing_newline = false;
527 else
528 {
529 p[buffered++] = '\n';
530 current->missing_newline = true;
531 }
532
533 if (!p)
534 return;
535
536 /* Don't use uninitialized storage when planting or using sentinels. */
537 memset (p + buffered, 0, sizeof (word));
538
539 if (strip_trailing_cr && (dst = memchr (p, '\r', buffered)))
540 {
541 char const *src = dst;
542 char const *srclim = p + buffered;
543
544 do
545 dst += ! ((*dst = *src++) == '\r' && *src == '\n');
546 while (src < srclim);
547
548 buffered -= src - dst;
549 }
550
551 current->buffered = buffered;
552 }
553
554 /* We have found N lines in a buffer of size S; guess the
555 proportionate number of lines that will be found in a buffer of
556 size T. However, do not guess a number of lines so large that the
557 resulting line table might cause overflow in size calculations. */
558 static lin
guess_lines(lin n,size_t s,size_t t)559 guess_lines (lin n, size_t s, size_t t)
560 {
561 size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
562 lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
563 return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
564 }
565
566 /* Given a vector of two file_data objects, find the identical
567 prefixes and suffixes of each object. */
568
569 static void
find_identical_ends(struct file_data filevec[])570 find_identical_ends (struct file_data filevec[])
571 {
572 word *w0, *w1;
573 char *p0, *p1, *buffer0, *buffer1;
574 char const *end0, *beg0;
575 char const **linbuf0, **linbuf1;
576 lin i, lines;
577 size_t n0, n1;
578 lin alloc_lines0, alloc_lines1;
579 lin buffered_prefix, prefix_count, prefix_mask;
580 lin middle_guess, suffix_guess;
581
582 slurp (&filevec[0]);
583 prepare_text (&filevec[0]);
584 if (filevec[0].desc != filevec[1].desc)
585 {
586 slurp (&filevec[1]);
587 prepare_text (&filevec[1]);
588 }
589 else
590 {
591 filevec[1].buffer = filevec[0].buffer;
592 filevec[1].bufsize = filevec[0].bufsize;
593 filevec[1].buffered = filevec[0].buffered;
594 filevec[1].missing_newline = filevec[0].missing_newline;
595 }
596
597 /* Find identical prefix. */
598
599 w0 = filevec[0].buffer;
600 w1 = filevec[1].buffer;
601 p0 = buffer0 = (char *) w0;
602 p1 = buffer1 = (char *) w1;
603 n0 = filevec[0].buffered;
604 n1 = filevec[1].buffered;
605
606 if (p0 == p1)
607 /* The buffers are the same; sentinels won't work. */
608 p0 = p1 += n1;
609 else
610 {
611 /* Insert end sentinels, in this case characters that are guaranteed
612 to make the equality test false, and thus terminate the loop. */
613
614 if (n0 < n1)
615 p0[n0] = ~p1[n0];
616 else
617 p1[n1] = ~p0[n1];
618
619 /* Loop until first mismatch, or to the sentinel characters. */
620
621 /* Compare a word at a time for speed. */
622 while (*w0 == *w1)
623 w0++, w1++;
624
625 /* Do the last few bytes of comparison a byte at a time. */
626 p0 = (char *) w0;
627 p1 = (char *) w1;
628 while (*p0 == *p1)
629 p0++, p1++;
630
631 /* Don't mistakenly count missing newline as part of prefix. */
632 if (ROBUST_OUTPUT_STYLE (output_style)
633 && ((buffer0 + n0 - filevec[0].missing_newline < p0)
634 !=
635 (buffer1 + n1 - filevec[1].missing_newline < p1)))
636 p0--, p1--;
637 }
638
639 /* Now P0 and P1 point at the first nonmatching characters. */
640
641 /* Skip back to last line-beginning in the prefix,
642 and then discard up to HORIZON_LINES lines from the prefix. */
643 i = horizon_lines;
644 while (p0 != buffer0 && (p0[-1] != '\n' || i--))
645 p0--, p1--;
646
647 /* Record the prefix. */
648 filevec[0].prefix_end = p0;
649 filevec[1].prefix_end = p1;
650
651 /* Find identical suffix. */
652
653 /* P0 and P1 point beyond the last chars not yet compared. */
654 p0 = buffer0 + n0;
655 p1 = buffer1 + n1;
656
657 if (! ROBUST_OUTPUT_STYLE (output_style)
658 || filevec[0].missing_newline == filevec[1].missing_newline)
659 {
660 end0 = p0; /* Addr of last char in file 0. */
661
662 /* Get value of P0 at which we should stop scanning backward:
663 this is when either P0 or P1 points just past the last char
664 of the identical prefix. */
665 beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
666
667 /* Scan back until chars don't match or we reach that point. */
668 for (; p0 != beg0; p0--, p1--)
669 if (*p0 != *p1)
670 {
671 /* Point at the first char of the matching suffix. */
672 beg0 = p0;
673 break;
674 }
675
676 /* Are we at a line-beginning in both files? If not, add the rest of
677 this line to the main body. Discard up to HORIZON_LINES lines from
678 the identical suffix. Also, discard one extra line,
679 because shift_boundaries may need it. */
680 i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
681 &&
682 (buffer1 == p1 || p1[-1] == '\n'));
683 while (i-- && p0 != end0)
684 while (*p0++ != '\n')
685 continue;
686
687 p1 += p0 - beg0;
688 }
689
690 /* Record the suffix. */
691 filevec[0].suffix_begin = p0;
692 filevec[1].suffix_begin = p1;
693
694 /* Calculate number of lines of prefix to save.
695
696 prefix_count == 0 means save the whole prefix;
697 we need this for options like -D that output the whole file,
698 or for enormous contexts (to avoid worrying about arithmetic overflow).
699 We also need it for options like -F that output some preceding line;
700 at least we will need to find the last few lines,
701 but since we don't know how many, it's easiest to find them all.
702
703 Otherwise, prefix_count != 0. Save just prefix_count lines at start
704 of the line buffer; they'll be moved to the proper location later.
705 Handle 1 more line than the context says (because we count 1 too many),
706 rounded up to the next power of 2 to speed index computation. */
707
708 if (no_diff_means_no_output && ! function_regexp.fastmap
709 && context < LIN_MAX / 4 && context < n0)
710 {
711 middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
712 suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
713 for (prefix_count = 1; prefix_count <= context; prefix_count *= 2)
714 continue;
715 alloc_lines0 = (prefix_count + middle_guess
716 + MIN (context, suffix_guess));
717 }
718 else
719 {
720 prefix_count = 0;
721 alloc_lines0 = guess_lines (0, 0, n0);
722 }
723
724 prefix_mask = prefix_count - 1;
725 lines = 0;
726 linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
727 p0 = buffer0;
728
729 /* If the prefix is needed, find the prefix lines. */
730 if (! (no_diff_means_no_output
731 && filevec[0].prefix_end == p0
732 && filevec[1].prefix_end == p1))
733 {
734 end0 = filevec[0].prefix_end;
735 while (p0 != end0)
736 {
737 lin l = lines++ & prefix_mask;
738 if (l == alloc_lines0)
739 {
740 if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
741 xalloc_die ();
742 alloc_lines0 *= 2;
743 linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
744 }
745 linbuf0[l] = p0;
746 while (*p0++ != '\n')
747 continue;
748 }
749 }
750 buffered_prefix = prefix_count && context < lines ? context : lines;
751
752 /* Allocate line buffer 1. */
753
754 middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
755 suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
756 alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
757 if (alloc_lines1 < buffered_prefix
758 || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
759 xalloc_die ();
760 linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
761
762 if (buffered_prefix != lines)
763 {
764 /* Rotate prefix lines to proper location. */
765 for (i = 0; i < buffered_prefix; i++)
766 linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
767 for (i = 0; i < buffered_prefix; i++)
768 linbuf0[i] = linbuf1[i];
769 }
770
771 /* Initialize line buffer 1 from line buffer 0. */
772 for (i = 0; i < buffered_prefix; i++)
773 linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
774
775 /* Record the line buffer, adjusted so that
776 linbuf[0] points at the first differing line. */
777 filevec[0].linbuf = linbuf0 + buffered_prefix;
778 filevec[1].linbuf = linbuf1 + buffered_prefix;
779 filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
780 filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
781 filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
782 filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
783 }
784
785 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
786 than 2**k. This table is derived from Chris K. Caldwell's list
787 <http://www.utm.edu/research/primes/lists/2small/>. */
788
789 static unsigned char const prime_offset[] =
790 {
791 0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
792 15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
793 11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
794 55, 93, 1, 57, 25
795 };
796
797 /* Verify that this host's size_t is not too wide for the above table. */
798
799 verify (enough_prime_offsets,
800 sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
801
802 /* Given a vector of two file_data objects, read the file associated
803 with each one, and build the table of equivalence classes.
804 Return nonzero if either file appears to be a binary file.
805 If PRETEND_BINARY is nonzero, pretend they are binary regardless. */
806
807 bool
read_files(struct file_data filevec[],bool pretend_binary)808 read_files (struct file_data filevec[], bool pretend_binary)
809 {
810 int i;
811 bool skip_test = text | pretend_binary;
812 bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
813
814 if (filevec[0].desc != filevec[1].desc)
815 appears_binary |= sip (&filevec[1], skip_test | appears_binary);
816 else
817 {
818 filevec[1].buffer = filevec[0].buffer;
819 filevec[1].bufsize = filevec[0].bufsize;
820 filevec[1].buffered = filevec[0].buffered;
821 }
822 if (appears_binary)
823 {
824 set_binary_mode (filevec[0].desc, true);
825 set_binary_mode (filevec[1].desc, true);
826 return true;
827 }
828
829 find_identical_ends (filevec);
830
831 equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
832 if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
833 xalloc_die ();
834 equivs = xmalloc (equivs_alloc * sizeof *equivs);
835 /* Equivalence class 0 is permanently safe for lines that were not
836 hashed. Real equivalence classes start at 1. */
837 equivs_index = 1;
838
839 /* Allocate (one plus) a prime number of hash buckets. Use a prime
840 number between 1/3 and 2/3 of the value of equiv_allocs,
841 approximately. */
842 for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
843 continue;
844 nbuckets = ((size_t) 1 << i) - prime_offset[i];
845 if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
846 xalloc_die ();
847 buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
848 buckets++;
849
850 for (i = 0; i < 2; i++)
851 find_and_hash_each_line (&filevec[i]);
852
853 filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
854
855 free (equivs);
856 free (buckets - 1);
857
858 return false;
859 }
860