1 /* File I/O for GNU DIFF. 2 3 Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002, 4 2004 Free Software Foundation, Inc. 5 6 This file is part of GNU DIFF. 7 8 GNU DIFF is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 2, or (at your option) 11 any later version. 12 13 GNU DIFF is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; see the file COPYING. 20 If not, write to the Free Software Foundation, 21 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ 22 23 #include "diff.h" 24 #include <cmpbuf.h> 25 #include <file-type.h> 26 #include <setmode.h> 27 #include <xalloc.h> 28 29 /* Rotate an unsigned value to the left. */ 30 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n))) 31 32 /* Given a hash value and a new character, return a new hash value. */ 33 #define HASH(h, c) ((c) + ROL (h, 7)) 34 35 /* The type of a hash value. */ 36 typedef size_t hash_value; 37 verify (hash_value_is_unsigned, ! TYPE_SIGNED (hash_value)); 38 39 /* Lines are put into equivalence classes of lines that match in lines_differ. 40 Each equivalence class is represented by one of these structures, 41 but only while the classes are being computed. 42 Afterward, each class is represented by a number. */ 43 struct equivclass 44 { 45 lin next; /* Next item in this bucket. */ 46 hash_value hash; /* Hash of lines in this class. */ 47 char const *line; /* A line that fits this class. */ 48 size_t length; /* That line's length, not counting its newline. */ 49 }; 50 51 /* Hash-table: array of buckets, each being a chain of equivalence classes. 52 buckets[-1] is reserved for incomplete lines. */ 53 static lin *buckets; 54 55 /* Number of buckets in the hash table array, not counting buckets[-1]. */ 56 static size_t nbuckets; 57 58 /* Array in which the equivalence classes are allocated. 59 The bucket-chains go through the elements in this array. 60 The number of an equivalence class is its index in this array. */ 61 static struct equivclass *equivs; 62 63 /* Index of first free element in the array `equivs'. */ 64 static lin equivs_index; 65 66 /* Number of elements allocated in the array `equivs'. */ 67 static lin equivs_alloc; 68 69 /* Read a block of data into a file buffer, checking for EOF and error. */ 70 71 void 72 file_block_read (struct file_data *current, size_t size) 73 { 74 if (size && ! current->eof) 75 { 76 size_t s = block_read (current->desc, 77 FILE_BUFFER (current) + current->buffered, size); 78 if (s == SIZE_MAX) 79 pfatal_with_name (current->name); 80 current->buffered += s; 81 current->eof = s < size; 82 } 83 } 84 85 /* Check for binary files and compare them for exact identity. */ 86 87 /* Return 1 if BUF contains a non text character. 88 SIZE is the number of characters in BUF. */ 89 90 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0) 91 92 /* Get ready to read the current file. 93 Return nonzero if SKIP_TEST is zero, 94 and if it appears to be a binary file. */ 95 96 static bool 97 sip (struct file_data *current, bool skip_test) 98 { 99 /* If we have a nonexistent file at this stage, treat it as empty. */ 100 if (current->desc < 0) 101 { 102 /* Leave room for a sentinel. */ 103 current->bufsize = sizeof (word); 104 current->buffer = xmalloc (current->bufsize); 105 } 106 else 107 { 108 current->bufsize = buffer_lcm (sizeof (word), 109 STAT_BLOCKSIZE (current->stat), 110 PTRDIFF_MAX - 2 * sizeof (word)); 111 current->buffer = xmalloc (current->bufsize); 112 113 if (! skip_test) 114 { 115 /* Check first part of file to see if it's a binary file. */ 116 117 bool was_binary = set_binary_mode (current->desc, true); 118 off_t buffered; 119 file_block_read (current, current->bufsize); 120 buffered = current->buffered; 121 122 if (! was_binary) 123 { 124 /* Revert to text mode and seek back to the beginning to 125 reread the file. Use relative seek, since file 126 descriptors like stdin might not start at offset 127 zero. */ 128 129 if (lseek (current->desc, - buffered, SEEK_CUR) == -1) 130 pfatal_with_name (current->name); 131 set_binary_mode (current->desc, false); 132 current->buffered = 0; 133 current->eof = false; 134 } 135 136 return binary_file_p (current->buffer, buffered); 137 } 138 } 139 140 current->buffered = 0; 141 current->eof = false; 142 return false; 143 } 144 145 /* Slurp the rest of the current file completely into memory. */ 146 147 static void 148 slurp (struct file_data *current) 149 { 150 size_t cc; 151 152 if (current->desc < 0) 153 { 154 /* The file is nonexistent. */ 155 return; 156 } 157 158 if (S_ISREG (current->stat.st_mode)) 159 { 160 /* It's a regular file; slurp in the rest all at once. */ 161 162 /* Get the size out of the stat block. 163 Allocate just enough room for appended newline plus word sentinel, 164 plus word-alignment since we want the buffer word-aligned. */ 165 size_t file_size = current->stat.st_size; 166 cc = file_size + 2 * sizeof (word) - file_size % sizeof (word); 167 if (file_size != current->stat.st_size || cc < file_size 168 || PTRDIFF_MAX <= cc) 169 xalloc_die (); 170 171 if (current->bufsize < cc) 172 { 173 current->bufsize = cc; 174 current->buffer = xrealloc (current->buffer, cc); 175 } 176 177 /* Try to read at least 1 more byte than the size indicates, to 178 detect whether the file is growing. This is a nicety for 179 users who run 'diff' on files while they are changing. */ 180 181 if (current->buffered <= file_size) 182 { 183 file_block_read (current, file_size + 1 - current->buffered); 184 if (current->buffered <= file_size) 185 return; 186 } 187 } 188 189 /* It's not a regular file, or it's a growing regular file; read it, 190 growing the buffer as needed. */ 191 192 file_block_read (current, current->bufsize - current->buffered); 193 194 if (current->buffered) 195 { 196 while (current->buffered == current->bufsize) 197 { 198 if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize) 199 xalloc_die (); 200 current->bufsize *= 2; 201 current->buffer = xrealloc (current->buffer, current->bufsize); 202 file_block_read (current, current->bufsize - current->buffered); 203 } 204 205 /* Allocate just enough room for appended newline plus word 206 sentinel, plus word-alignment. */ 207 cc = current->buffered + 2 * sizeof (word); 208 current->bufsize = cc - cc % sizeof (word); 209 current->buffer = xrealloc (current->buffer, current->bufsize); 210 } 211 } 212 213 /* Split the file into lines, simultaneously computing the equivalence 214 class for each line. */ 215 216 static void 217 find_and_hash_each_line (struct file_data *current) 218 { 219 hash_value h; 220 char const *p = current->prefix_end; 221 unsigned char c; 222 lin i, *bucket; 223 size_t length; 224 225 /* Cache often-used quantities in local variables to help the compiler. */ 226 char const **linbuf = current->linbuf; 227 lin alloc_lines = current->alloc_lines; 228 lin line = 0; 229 lin linbuf_base = current->linbuf_base; 230 lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs); 231 struct equivclass *eqs = equivs; 232 lin eqs_index = equivs_index; 233 lin eqs_alloc = equivs_alloc; 234 char const *suffix_begin = current->suffix_begin; 235 char const *bufend = FILE_BUFFER (current) + current->buffered; 236 bool diff_length_compare_anyway = 237 ignore_white_space != IGNORE_NO_WHITE_SPACE; 238 bool same_length_diff_contents_compare_anyway = 239 diff_length_compare_anyway | ignore_case; 240 241 while (p < suffix_begin) 242 { 243 char const *ip = p; 244 245 h = 0; 246 247 /* Hash this line until we find a newline. */ 248 if (ignore_case) 249 switch (ignore_white_space) 250 { 251 case IGNORE_ALL_SPACE: 252 while ((c = *p++) != '\n') 253 if (! isspace (c)) 254 h = HASH (h, tolower (c)); 255 break; 256 257 case IGNORE_SPACE_CHANGE: 258 while ((c = *p++) != '\n') 259 { 260 if (isspace (c)) 261 { 262 do 263 if ((c = *p++) == '\n') 264 goto hashing_done; 265 while (isspace (c)); 266 267 h = HASH (h, ' '); 268 } 269 270 /* C is now the first non-space. */ 271 h = HASH (h, tolower (c)); 272 } 273 break; 274 275 case IGNORE_TAB_EXPANSION: 276 { 277 size_t column = 0; 278 while ((c = *p++) != '\n') 279 { 280 size_t repetitions = 1; 281 282 switch (c) 283 { 284 case '\b': 285 column -= 0 < column; 286 break; 287 288 case '\t': 289 c = ' '; 290 repetitions = tabsize - column % tabsize; 291 column = (column + repetitions < column 292 ? 0 293 : column + repetitions); 294 break; 295 296 case '\r': 297 column = 0; 298 break; 299 300 default: 301 c = tolower (c); 302 column++; 303 break; 304 } 305 306 do 307 h = HASH (h, c); 308 while (--repetitions != 0); 309 } 310 } 311 break; 312 313 default: 314 while ((c = *p++) != '\n') 315 h = HASH (h, tolower (c)); 316 break; 317 } 318 else 319 switch (ignore_white_space) 320 { 321 case IGNORE_ALL_SPACE: 322 while ((c = *p++) != '\n') 323 if (! isspace (c)) 324 h = HASH (h, c); 325 break; 326 327 case IGNORE_SPACE_CHANGE: 328 while ((c = *p++) != '\n') 329 { 330 if (isspace (c)) 331 { 332 do 333 if ((c = *p++) == '\n') 334 goto hashing_done; 335 while (isspace (c)); 336 337 h = HASH (h, ' '); 338 } 339 340 /* C is now the first non-space. */ 341 h = HASH (h, c); 342 } 343 break; 344 345 case IGNORE_TAB_EXPANSION: 346 { 347 size_t column = 0; 348 while ((c = *p++) != '\n') 349 { 350 size_t repetitions = 1; 351 352 switch (c) 353 { 354 case '\b': 355 column -= 0 < column; 356 break; 357 358 case '\t': 359 c = ' '; 360 repetitions = tabsize - column % tabsize; 361 column = (column + repetitions < column 362 ? 0 363 : column + repetitions); 364 break; 365 366 case '\r': 367 column = 0; 368 break; 369 370 default: 371 column++; 372 break; 373 } 374 375 do 376 h = HASH (h, c); 377 while (--repetitions != 0); 378 } 379 } 380 break; 381 382 default: 383 while ((c = *p++) != '\n') 384 h = HASH (h, c); 385 break; 386 } 387 388 hashing_done:; 389 390 bucket = &buckets[h % nbuckets]; 391 length = p - ip - 1; 392 393 if (p == bufend 394 && current->missing_newline 395 && ROBUST_OUTPUT_STYLE (output_style)) 396 { 397 /* This line is incomplete. If this is significant, 398 put the line into buckets[-1]. */ 399 if (ignore_white_space < IGNORE_SPACE_CHANGE) 400 bucket = &buckets[-1]; 401 402 /* Omit the inserted newline when computing linbuf later. */ 403 p--; 404 bufend = suffix_begin = p; 405 } 406 407 for (i = *bucket; ; i = eqs[i].next) 408 if (!i) 409 { 410 /* Create a new equivalence class in this bucket. */ 411 i = eqs_index++; 412 if (i == eqs_alloc) 413 { 414 if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc) 415 xalloc_die (); 416 eqs_alloc *= 2; 417 eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs); 418 } 419 eqs[i].next = *bucket; 420 eqs[i].hash = h; 421 eqs[i].line = ip; 422 eqs[i].length = length; 423 *bucket = i; 424 break; 425 } 426 else if (eqs[i].hash == h) 427 { 428 char const *eqline = eqs[i].line; 429 430 /* Reuse existing class if lines_differ reports the lines 431 equal. */ 432 if (eqs[i].length == length) 433 { 434 /* Reuse existing equivalence class if the lines are identical. 435 This detects the common case of exact identity 436 faster than lines_differ would. */ 437 if (memcmp (eqline, ip, length) == 0) 438 break; 439 if (!same_length_diff_contents_compare_anyway) 440 continue; 441 } 442 else if (!diff_length_compare_anyway) 443 continue; 444 445 if (! lines_differ (eqline, ip)) 446 break; 447 } 448 449 /* Maybe increase the size of the line table. */ 450 if (line == alloc_lines) 451 { 452 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */ 453 if (PTRDIFF_MAX / 3 <= alloc_lines 454 || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base 455 || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base) 456 xalloc_die (); 457 alloc_lines = 2 * alloc_lines - linbuf_base; 458 cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs); 459 linbuf += linbuf_base; 460 linbuf = xrealloc (linbuf, 461 (alloc_lines - linbuf_base) * sizeof *linbuf); 462 linbuf -= linbuf_base; 463 } 464 linbuf[line] = ip; 465 cureqs[line] = i; 466 ++line; 467 } 468 469 current->buffered_lines = line; 470 471 for (i = 0; ; i++) 472 { 473 /* Record the line start for lines in the suffix that we care about. 474 Record one more line start than lines, 475 so that we can compute the length of any buffered line. */ 476 if (line == alloc_lines) 477 { 478 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */ 479 if (PTRDIFF_MAX / 3 <= alloc_lines 480 || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base 481 || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base) 482 xalloc_die (); 483 alloc_lines = 2 * alloc_lines - linbuf_base; 484 linbuf += linbuf_base; 485 linbuf = xrealloc (linbuf, 486 (alloc_lines - linbuf_base) * sizeof *linbuf); 487 linbuf -= linbuf_base; 488 } 489 linbuf[line] = p; 490 491 if (p == bufend) 492 break; 493 494 if (context <= i && no_diff_means_no_output) 495 break; 496 497 line++; 498 499 while (*p++ != '\n') 500 continue; 501 } 502 503 /* Done with cache in local variables. */ 504 current->linbuf = linbuf; 505 current->valid_lines = line; 506 current->alloc_lines = alloc_lines; 507 current->equivs = cureqs; 508 equivs = eqs; 509 equivs_alloc = eqs_alloc; 510 equivs_index = eqs_index; 511 } 512 513 /* Prepare the text. Make sure the text end is initialized. 514 Make sure text ends in a newline, 515 but remember that we had to add one. 516 Strip trailing CRs, if that was requested. */ 517 518 static void 519 prepare_text (struct file_data *current) 520 { 521 size_t buffered = current->buffered; 522 char *p = FILE_BUFFER (current); 523 char *dst; 524 525 if (buffered == 0 || p[buffered - 1] == '\n') 526 current->missing_newline = false; 527 else 528 { 529 p[buffered++] = '\n'; 530 current->missing_newline = true; 531 } 532 533 if (!p) 534 return; 535 536 /* Don't use uninitialized storage when planting or using sentinels. */ 537 memset (p + buffered, 0, sizeof (word)); 538 539 if (strip_trailing_cr && (dst = memchr (p, '\r', buffered))) 540 { 541 char const *src = dst; 542 char const *srclim = p + buffered; 543 544 do 545 dst += ! ((*dst = *src++) == '\r' && *src == '\n'); 546 while (src < srclim); 547 548 buffered -= src - dst; 549 } 550 551 current->buffered = buffered; 552 } 553 554 /* We have found N lines in a buffer of size S; guess the 555 proportionate number of lines that will be found in a buffer of 556 size T. However, do not guess a number of lines so large that the 557 resulting line table might cause overflow in size calculations. */ 558 static lin 559 guess_lines (lin n, size_t s, size_t t) 560 { 561 size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1); 562 lin guessed_lines = MAX (1, t / guessed_bytes_per_line); 563 return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5; 564 } 565 566 /* Given a vector of two file_data objects, find the identical 567 prefixes and suffixes of each object. */ 568 569 static void 570 find_identical_ends (struct file_data filevec[]) 571 { 572 word *w0, *w1; 573 char *p0, *p1, *buffer0, *buffer1; 574 char const *end0, *beg0; 575 char const **linbuf0, **linbuf1; 576 lin i, lines; 577 size_t n0, n1; 578 lin alloc_lines0, alloc_lines1; 579 lin buffered_prefix, prefix_count, prefix_mask; 580 lin middle_guess, suffix_guess; 581 582 slurp (&filevec[0]); 583 prepare_text (&filevec[0]); 584 if (filevec[0].desc != filevec[1].desc) 585 { 586 slurp (&filevec[1]); 587 prepare_text (&filevec[1]); 588 } 589 else 590 { 591 filevec[1].buffer = filevec[0].buffer; 592 filevec[1].bufsize = filevec[0].bufsize; 593 filevec[1].buffered = filevec[0].buffered; 594 filevec[1].missing_newline = filevec[0].missing_newline; 595 } 596 597 /* Find identical prefix. */ 598 599 w0 = filevec[0].buffer; 600 w1 = filevec[1].buffer; 601 p0 = buffer0 = (char *) w0; 602 p1 = buffer1 = (char *) w1; 603 n0 = filevec[0].buffered; 604 n1 = filevec[1].buffered; 605 606 if (p0 == p1) 607 /* The buffers are the same; sentinels won't work. */ 608 p0 = p1 += n1; 609 else 610 { 611 /* Insert end sentinels, in this case characters that are guaranteed 612 to make the equality test false, and thus terminate the loop. */ 613 614 if (n0 < n1) 615 p0[n0] = ~p1[n0]; 616 else 617 p1[n1] = ~p0[n1]; 618 619 /* Loop until first mismatch, or to the sentinel characters. */ 620 621 /* Compare a word at a time for speed. */ 622 while (*w0 == *w1) 623 w0++, w1++; 624 625 /* Do the last few bytes of comparison a byte at a time. */ 626 p0 = (char *) w0; 627 p1 = (char *) w1; 628 while (*p0 == *p1) 629 p0++, p1++; 630 631 /* Don't mistakenly count missing newline as part of prefix. */ 632 if (ROBUST_OUTPUT_STYLE (output_style) 633 && ((buffer0 + n0 - filevec[0].missing_newline < p0) 634 != 635 (buffer1 + n1 - filevec[1].missing_newline < p1))) 636 p0--, p1--; 637 } 638 639 /* Now P0 and P1 point at the first nonmatching characters. */ 640 641 /* Skip back to last line-beginning in the prefix, 642 and then discard up to HORIZON_LINES lines from the prefix. */ 643 i = horizon_lines; 644 while (p0 != buffer0 && (p0[-1] != '\n' || i--)) 645 p0--, p1--; 646 647 /* Record the prefix. */ 648 filevec[0].prefix_end = p0; 649 filevec[1].prefix_end = p1; 650 651 /* Find identical suffix. */ 652 653 /* P0 and P1 point beyond the last chars not yet compared. */ 654 p0 = buffer0 + n0; 655 p1 = buffer1 + n1; 656 657 if (! ROBUST_OUTPUT_STYLE (output_style) 658 || filevec[0].missing_newline == filevec[1].missing_newline) 659 { 660 end0 = p0; /* Addr of last char in file 0. */ 661 662 /* Get value of P0 at which we should stop scanning backward: 663 this is when either P0 or P1 points just past the last char 664 of the identical prefix. */ 665 beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1); 666 667 /* Scan back until chars don't match or we reach that point. */ 668 for (; p0 != beg0; p0--, p1--) 669 if (*p0 != *p1) 670 { 671 /* Point at the first char of the matching suffix. */ 672 beg0 = p0; 673 break; 674 } 675 676 /* Are we at a line-beginning in both files? If not, add the rest of 677 this line to the main body. Discard up to HORIZON_LINES lines from 678 the identical suffix. Also, discard one extra line, 679 because shift_boundaries may need it. */ 680 i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n') 681 && 682 (buffer1 == p1 || p1[-1] == '\n')); 683 while (i-- && p0 != end0) 684 while (*p0++ != '\n') 685 continue; 686 687 p1 += p0 - beg0; 688 } 689 690 /* Record the suffix. */ 691 filevec[0].suffix_begin = p0; 692 filevec[1].suffix_begin = p1; 693 694 /* Calculate number of lines of prefix to save. 695 696 prefix_count == 0 means save the whole prefix; 697 we need this for options like -D that output the whole file, 698 or for enormous contexts (to avoid worrying about arithmetic overflow). 699 We also need it for options like -F that output some preceding line; 700 at least we will need to find the last few lines, 701 but since we don't know how many, it's easiest to find them all. 702 703 Otherwise, prefix_count != 0. Save just prefix_count lines at start 704 of the line buffer; they'll be moved to the proper location later. 705 Handle 1 more line than the context says (because we count 1 too many), 706 rounded up to the next power of 2 to speed index computation. */ 707 708 if (no_diff_means_no_output && ! function_regexp.fastmap 709 && context < LIN_MAX / 4 && context < n0) 710 { 711 middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end); 712 suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0); 713 for (prefix_count = 1; prefix_count <= context; prefix_count *= 2) 714 continue; 715 alloc_lines0 = (prefix_count + middle_guess 716 + MIN (context, suffix_guess)); 717 } 718 else 719 { 720 prefix_count = 0; 721 alloc_lines0 = guess_lines (0, 0, n0); 722 } 723 724 prefix_mask = prefix_count - 1; 725 lines = 0; 726 linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0); 727 p0 = buffer0; 728 729 /* If the prefix is needed, find the prefix lines. */ 730 if (! (no_diff_means_no_output 731 && filevec[0].prefix_end == p0 732 && filevec[1].prefix_end == p1)) 733 { 734 end0 = filevec[0].prefix_end; 735 while (p0 != end0) 736 { 737 lin l = lines++ & prefix_mask; 738 if (l == alloc_lines0) 739 { 740 if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0) 741 xalloc_die (); 742 alloc_lines0 *= 2; 743 linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0); 744 } 745 linbuf0[l] = p0; 746 while (*p0++ != '\n') 747 continue; 748 } 749 } 750 buffered_prefix = prefix_count && context < lines ? context : lines; 751 752 /* Allocate line buffer 1. */ 753 754 middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end); 755 suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1); 756 alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess); 757 if (alloc_lines1 < buffered_prefix 758 || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1) 759 xalloc_die (); 760 linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1); 761 762 if (buffered_prefix != lines) 763 { 764 /* Rotate prefix lines to proper location. */ 765 for (i = 0; i < buffered_prefix; i++) 766 linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask]; 767 for (i = 0; i < buffered_prefix; i++) 768 linbuf0[i] = linbuf1[i]; 769 } 770 771 /* Initialize line buffer 1 from line buffer 0. */ 772 for (i = 0; i < buffered_prefix; i++) 773 linbuf1[i] = linbuf0[i] - buffer0 + buffer1; 774 775 /* Record the line buffer, adjusted so that 776 linbuf[0] points at the first differing line. */ 777 filevec[0].linbuf = linbuf0 + buffered_prefix; 778 filevec[1].linbuf = linbuf1 + buffered_prefix; 779 filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix; 780 filevec[0].alloc_lines = alloc_lines0 - buffered_prefix; 781 filevec[1].alloc_lines = alloc_lines1 - buffered_prefix; 782 filevec[0].prefix_lines = filevec[1].prefix_lines = lines; 783 } 784 785 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less 786 than 2**k. This table is derived from Chris K. Caldwell's list 787 <http://www.utm.edu/research/primes/lists/2small/>. */ 788 789 static unsigned char const prime_offset[] = 790 { 791 0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3, 792 15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21, 793 11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27, 794 55, 93, 1, 57, 25 795 }; 796 797 /* Verify that this host's size_t is not too wide for the above table. */ 798 799 verify (enough_prime_offsets, 800 sizeof (size_t) * CHAR_BIT <= sizeof prime_offset); 801 802 /* Given a vector of two file_data objects, read the file associated 803 with each one, and build the table of equivalence classes. 804 Return nonzero if either file appears to be a binary file. 805 If PRETEND_BINARY is nonzero, pretend they are binary regardless. */ 806 807 bool 808 read_files (struct file_data filevec[], bool pretend_binary) 809 { 810 int i; 811 bool skip_test = text | pretend_binary; 812 bool appears_binary = pretend_binary | sip (&filevec[0], skip_test); 813 814 if (filevec[0].desc != filevec[1].desc) 815 appears_binary |= sip (&filevec[1], skip_test | appears_binary); 816 else 817 { 818 filevec[1].buffer = filevec[0].buffer; 819 filevec[1].bufsize = filevec[0].bufsize; 820 filevec[1].buffered = filevec[0].buffered; 821 } 822 if (appears_binary) 823 { 824 set_binary_mode (filevec[0].desc, true); 825 set_binary_mode (filevec[1].desc, true); 826 return true; 827 } 828 829 find_identical_ends (filevec); 830 831 equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1; 832 if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc) 833 xalloc_die (); 834 equivs = xmalloc (equivs_alloc * sizeof *equivs); 835 /* Equivalence class 0 is permanently safe for lines that were not 836 hashed. Real equivalence classes start at 1. */ 837 equivs_index = 1; 838 839 /* Allocate (one plus) a prime number of hash buckets. Use a prime 840 number between 1/3 and 2/3 of the value of equiv_allocs, 841 approximately. */ 842 for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++) 843 continue; 844 nbuckets = ((size_t) 1 << i) - prime_offset[i]; 845 if (PTRDIFF_MAX / sizeof *buckets <= nbuckets) 846 xalloc_die (); 847 buckets = zalloc ((nbuckets + 1) * sizeof *buckets); 848 buckets++; 849 850 for (i = 0; i < 2; i++) 851 find_and_hash_each_line (&filevec[i]); 852 853 filevec[0].equiv_max = filevec[1].equiv_max = equivs_index; 854 855 free (equivs); 856 free (buckets - 1); 857 858 return false; 859 } 860