/* Implementation of the Patience Diff algorithm invented by Bram Cohen:
 * Divide a diff problem into smaller chunks by an LCS (Longest Common
 * Subsequence) of common-unique lines. */
/*
 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <arraylist.h>
#include <diff_main.h>

#include "diff_internal.h"
#include "diff_debug.h"

/* Algorithm to find unique lines:
 * 0: stupidly iterate atoms
 * 1: qsort
 * 2: mergesort
 */
#define UNIQUE_STRATEGY 1

/* Per-atom state for the Patience Diff algorithm */
struct atom_patience {
#if UNIQUE_STRATEGY == 0
	bool unique_here;
#endif
	bool unique_in_both;
	struct diff_atom *pos_in_other;
	struct diff_atom *prev_stack;
	struct diff_range identical_lines;
};

/* A diff_atom has a backpointer to the root diff_data. That points to the
 * current diff_data, a possibly smaller section of the root. That current
 * diff_data->algo_data is a pointer to an array of struct atom_patience. The
 * atom's index in the current diff_data gives the index in the atom_patience
 * array. */
#define PATIENCE(ATOM) \
	(((struct atom_patience*)((ATOM)->root->current->algo_data))\
	 [diff_atom_idx((ATOM)->root->current, ATOM)])

#if UNIQUE_STRATEGY == 0

/* Stupid iteration and comparison of all atoms */
static int
diff_atoms_mark_unique(struct diff_data *d, unsigned int *unique_count)
{
	struct diff_atom *i;
	unsigned int count = 0;
	diff_data_foreach_atom(i, d) {
		PATIENCE(i).unique_here = true;
		PATIENCE(i).unique_in_both = true;
		count++;
	}
	diff_data_foreach_atom(i, d) {
		struct diff_atom *j;

		if (!PATIENCE(i).unique_here)
			continue;

		diff_data_foreach_atom_from(i + 1, j, d) {
			bool same;
			int r = diff_atom_same(&same, i, j);
			if (r)
				return r;
			if (!same)
				continue;
			if (PATIENCE(i).unique_here) {
				PATIENCE(i).unique_here = false;
				PATIENCE(i).unique_in_both = false;
				count--;
			}
			PATIENCE(j).unique_here = false;
			PATIENCE(j).unique_in_both = false;
			count--;
		}
	}
	if (unique_count)
		*unique_count = count;
	return 0;
}

/* Mark lines that appear exactly once on each side with
 * PATIENCE(atom).unique_in_both = true. */
static int
diff_atoms_mark_unique_in_both(struct diff_data *left, struct diff_data *right,
			       unsigned int *unique_in_both_count)
{
	/* Derive the final unique_in_both count without needing an explicit
	 * iteration -- this is just an optimization to save one iteration
	 * at the end. */
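	/* We start from the number of lines that are unique within 'left' and
	 * decrement it whenever such a line turns out to have no unique
	 * counterpart in 'right'; the final count then needs no extra pass. */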
	unsigned int unique_in_both;
	int r;

	r = diff_atoms_mark_unique(left, &unique_in_both);
	if (r)
		return r;
	r = diff_atoms_mark_unique(right, NULL);
	if (r)
		return r;

	debug("unique_in_both %u\n", unique_in_both);

	struct diff_atom *i;
	diff_data_foreach_atom(i, left) {
		if (!PATIENCE(i).unique_here)
			continue;
		struct diff_atom *j;
		int found_in_b = 0;
		diff_data_foreach_atom(j, right) {
			bool same;
			int r = diff_atom_same(&same, i, j);
			if (r)
				return r;
			if (!same)
				continue;
			if (!PATIENCE(j).unique_here) {
				found_in_b = 2; /* or more */
				break;
			} else {
				found_in_b = 1;
				PATIENCE(j).pos_in_other = i;
				PATIENCE(i).pos_in_other = j;
			}
		}

		if (found_in_b == 0 || found_in_b > 1) {
			PATIENCE(i).unique_in_both = false;
			unique_in_both--;
			debug("unique_in_both %u (%d) ", unique_in_both,
			      found_in_b);
			debug_dump_atom(left, NULL, i);
		}
	}

	/* Still need to unmark right[*]->patience.unique_in_both for atoms
	 * that don't exist in left */
	diff_data_foreach_atom(i, right) {
		if (!PATIENCE(i).unique_here
		    || !PATIENCE(i).unique_in_both)
			continue;
		struct diff_atom *j;
		bool found_in_a = false;
		diff_data_foreach_atom(j, left) {
			bool same;
			int r;
			if (!PATIENCE(j).unique_in_both)
				continue;
			r = diff_atom_same(&same, i, j);
			if (r)
				return r;
			if (!same)
				continue;
			found_in_a = true;
			break;
		}

		if (!found_in_a)
			PATIENCE(i).unique_in_both = false;
	}

	if (unique_in_both_count)
		*unique_in_both_count = unique_in_both;
	return 0;
}

#else /* UNIQUE_STRATEGY != 0 */

/* Use an optimized sorting algorithm (qsort, mergesort) to find unique lines */

static int diff_atoms_compar(const void *_a, const void *_b)
{
	const struct diff_atom *a = *(struct diff_atom**)_a;
	const struct diff_atom *b = *(struct diff_atom**)_b;
	int cmp;
	int rc = 0;

	/* If there's been an error (e.g. I/O error) in a previous compar, we
	 * have no way to abort the sort but just report the rc and stop
	 * comparing. Make sure to catch errors on either side. If atoms are
	 * from more than one diff_data, make sure the error, if any, spreads
	 * to all of them, so we can cut short all future comparisons. */
	if (a->root->err)
		rc = a->root->err;
	if (b->root->err)
		rc = b->root->err;
	if (rc) {
		a->root->err = rc;
		b->root->err = rc;
		/* just return 'equal' to not swap more positions */
		return 0;
	}

	/* Sort by the simplistic hash */
	if (a->hash < b->hash)
		return -1;
	if (a->hash > b->hash)
		return 1;

	/* If hashes are the same, the lines may still differ. Do a full cmp. */
	rc = diff_atom_cmp(&cmp, a, b);

	if (rc) {
		/* Mark the I/O error so that the caller can find out about it.
		 * For the case atoms are from more than one diff_data, mark in
		 * both. */
		a->root->err = rc;
		if (a->root != b->root)
			b->root->err = rc;
		return 0;
	}

	return cmp;
}
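/* With this ordering, atoms with identical content end up adjacent in the
 * sorted array no matter which of the two sides they come from, so the caller
 * can count repetitions per side in a single pass over the sorted array. */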
/* Sort an array of struct diff_atom* in-place. */
static int diff_atoms_sort(struct diff_atom *atoms[],
			   size_t atoms_count)
{
#if UNIQUE_STRATEGY == 1
	qsort(atoms, atoms_count, sizeof(struct diff_atom*), diff_atoms_compar);
#else
	mergesort(atoms, atoms_count, sizeof(struct diff_atom*),
		  diff_atoms_compar);
#endif
	return atoms[0]->root->err;
}

static int
diff_atoms_mark_unique_in_both(struct diff_data *left, struct diff_data *right,
			       unsigned int *unique_in_both_count_p)
{
	struct diff_atom *a;
	struct diff_atom *b;
	struct diff_atom **all_atoms;
	unsigned int len = 0;
	unsigned int i;
	unsigned int unique_in_both_count = 0;
	int rc;

	all_atoms = calloc(left->atoms.len + right->atoms.len,
			   sizeof(struct diff_atom *));
	if (all_atoms == NULL)
		return ENOMEM;

	left->err = 0;
	right->err = 0;
	left->root->err = 0;
	right->root->err = 0;
	diff_data_foreach_atom(a, left) {
		all_atoms[len++] = a;
	}
	diff_data_foreach_atom(b, right) {
		all_atoms[len++] = b;
	}

	rc = diff_atoms_sort(all_atoms, len);
	if (rc)
		goto free_and_exit;

	/* Now we have a sorted array of atom pointers. All similar lines are
	 * adjacent. Walk through the array and mark those that are unique on
	 * each side, but exist once in both sources. */
	for (i = 0; i < len; i++) {
		bool same;
		unsigned int next_differing_i;
		unsigned int last_identical_i;
		unsigned int j;
		unsigned int count_first_side = 1;
		unsigned int count_other_side = 0;
		a = all_atoms[i];
		debug("a: ");
		debug_dump_atom(a->root, NULL, a);

		/* Do as few diff_atom_cmp() as possible: first walk forward
		 * only using the cheap hash as indicator for differing atoms;
		 * then walk backwards until hitting an identical atom. */
		for (next_differing_i = i + 1; next_differing_i < len;
		     next_differing_i++) {
			b = all_atoms[next_differing_i];
			if (a->hash != b->hash)
				break;
		}
		for (last_identical_i = next_differing_i - 1;
		     last_identical_i > i;
		     last_identical_i--) {
			b = all_atoms[last_identical_i];
			rc = diff_atom_same(&same, a, b);
			if (rc)
				goto free_and_exit;
			if (same)
				break;
		}
		next_differing_i = last_identical_i + 1;

		for (j = i + 1; j < next_differing_i; j++) {
			b = all_atoms[j];
			/* A following atom is the same. See on which side the
			 * repetition counts. */
			if (a->root == b->root)
				count_first_side++;
			else
				count_other_side++;
			debug("b: ");
			debug_dump_atom(b->root, NULL, b);
			debug(" count_first_side=%d count_other_side=%d\n",
			      count_first_side, count_other_side);
		}

		/* Counted a section of similar atoms, put the results back to
		 * the atoms. */
		if ((count_first_side == 1)
		    && (count_other_side == 1)) {
			b = all_atoms[i + 1];
			PATIENCE(a).unique_in_both = true;
			PATIENCE(a).pos_in_other = b;
			PATIENCE(b).unique_in_both = true;
			PATIENCE(b).pos_in_other = a;
			unique_in_both_count++;
		}

		/* j now points at the first atom after 'a' that is not
		 * identical to 'a'. j is always > i. */
		i = j - 1;
	}
	*unique_in_both_count_p = unique_in_both_count;
	rc = 0;
free_and_exit:
	free(all_atoms);
	return rc;
}
#endif /* UNIQUE_STRATEGY != 0 */
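/* A made-up illustration of the patience sort below: if the common-unique
 * atoms, visited in 'left' order, sit at 'right' indexes (pos_in_other)
 * 3 1 4 2 5, then the stacks' topmost cards evolve as
 *   [3] -> [1] -> [1 4] -> [1 2] -> [1 2 5]
 * and backtracking from the last stack's top via prev_stack yields 1 2 5,
 * i.e. three atoms that appear in the same relative order on both sides. */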
/* Binary search to find the stack to put this atom "card" on. */
static int
find_target_stack(struct diff_atom *atom,
		  struct diff_atom **patience_stacks,
		  unsigned int patience_stacks_count)
{
	unsigned int lo = 0;
	unsigned int hi = patience_stacks_count;
	while (lo < hi) {
		unsigned int mid = (lo + hi) >> 1;

		if (PATIENCE(patience_stacks[mid]).pos_in_other
		    < PATIENCE(atom).pos_in_other)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo;
}

/* Among the lines that appear exactly once on each side, find the longest
 * streak that appears in both files in the same order (with other stuff
 * allowed to interleave). Use patience sort for that, as in the Patience Diff
 * algorithm.
 * See https://bramcohen.livejournal.com/73318.html and, for a much more
 * detailed explanation,
 * https://blog.jcoglan.com/2017/09/19/the-patience-diff-algorithm/ */
int
diff_algo_patience(const struct diff_algo_config *algo_config,
		   struct diff_state *state)
{
	int rc;
	struct diff_data *left = &state->left;
	struct diff_data *right = &state->right;
	struct atom_patience *atom_patience_left =
		calloc(left->atoms.len, sizeof(struct atom_patience));
	struct atom_patience *atom_patience_right =
		calloc(right->atoms.len, sizeof(struct atom_patience));
	unsigned int unique_in_both_count;
	struct diff_atom **lcs = NULL;

	debug("\n** %s\n", __func__);

	left->root->current = left;
	right->root->current = right;
	left->algo_data = atom_patience_left;
	right->algo_data = atom_patience_right;

	/* Find those lines that appear exactly once in 'left' and exactly once
	 * in 'right'. */
	rc = diff_atoms_mark_unique_in_both(left, right, &unique_in_both_count);
	if (rc)
		goto free_and_exit;

	debug("unique_in_both_count %u\n", unique_in_both_count);
	debug("left:\n");
	debug_dump(left);
	debug("right:\n");
	debug_dump(right);

	if (!unique_in_both_count) {
		/* Cannot apply Patience, tell the caller to use fallback_algo
		 * instead. */
		rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;
		goto free_and_exit;
	}

	rc = ENOMEM; /* default for the allocation failures below */

	/* The subscope below produces the Longest Common Subsequence as an
	 * array of atoms: */
	unsigned int lcs_count = 0;
	struct diff_atom *lcs_tail = NULL;

	{
		/* This subscope marks the lifetime of the atom_pointers
		 * allocation */

		/* One chunk of storage for atom pointers */
		struct diff_atom **atom_pointers;
		atom_pointers = recallocarray(NULL, 0, unique_in_both_count * 2,
					      sizeof(struct diff_atom*));
		if (atom_pointers == NULL)
			goto free_and_exit; /* rc == ENOMEM */
		/* Half for the list of atoms that still need to be put on
		 * stacks */
		struct diff_atom **uniques = atom_pointers;

		/* Half for the patience sort state's "card stacks" -- we
		 * remember only each stack's topmost "card" */
		struct diff_atom **patience_stacks;
		patience_stacks = atom_pointers + unique_in_both_count;
		unsigned int patience_stacks_count = 0;

		/* Take all common, unique items from 'left' ... */

		struct diff_atom *atom;
		struct diff_atom **uniques_end = uniques;
		diff_data_foreach_atom(atom, left) {
			if (!PATIENCE(atom).unique_in_both)
				continue;
			*uniques_end = atom;
			uniques_end++;
		}
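		/* 'uniques' now lists the common-unique atoms in the order
		 * they appear in 'left'; each PATIENCE(atom).pos_in_other
		 * points at the counterpart atom in 'right'. */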
		/* ...and sort them into the order found in 'right'.
		 * The idea is to find the leftmost stack whose top has a
		 * higher line number and place the atom on that stack's top.
		 * If there is no such stack, open a new one on the right. The
		 * line number is derived from the atom*, which are array items
		 * and hence reflect the relative position in the source file.
		 * So we take the common-uniques from 'left' and sort them
		 * according to PATIENCE(atom).pos_in_other. */
		unsigned int i;
		for (i = 0; i < unique_in_both_count; i++) {
			atom = uniques[i];
			unsigned int target_stack;
			target_stack = find_target_stack(atom, patience_stacks,
							 patience_stacks_count);
			assert(target_stack <= patience_stacks_count);
			patience_stacks[target_stack] = atom;
			if (target_stack == patience_stacks_count)
				patience_stacks_count++;

			/* Record a back reference to the next stack on the
			 * left, which will form the final longest sequence
			 * later. */
			PATIENCE(atom).prev_stack = target_stack ?
				patience_stacks[target_stack - 1] : NULL;

			{
				int xx;
				for (xx = 0; xx < patience_stacks_count; xx++) {
					debug(" %s%d",
					      (xx == target_stack) ? ">" : "",
					      diff_atom_idx(right,
						PATIENCE(patience_stacks[xx]).pos_in_other));
				}
				debug("\n");
			}
		}

		/* Backtrace through prev_stack references to form the final
		 * longest common sequence. */
		lcs_tail = patience_stacks[patience_stacks_count - 1];
		lcs_count = patience_stacks_count;

		/* uniques and patience_stacks are no longer needed.
		 * Backpointers are in PATIENCE(atom).prev_stack */
		free(atom_pointers);
	}

	lcs = recallocarray(NULL, 0, lcs_count, sizeof(struct diff_atom*));
	if (lcs == NULL)
		goto free_and_exit; /* rc == ENOMEM */

	struct diff_atom **lcs_backtrace_pos = &lcs[lcs_count - 1];
	struct diff_atom *atom;
	for (atom = lcs_tail; atom;
	     atom = PATIENCE(atom).prev_stack, lcs_backtrace_pos--) {
		assert(lcs_backtrace_pos >= lcs);
		*lcs_backtrace_pos = atom;
	}

	unsigned int i;
	if (DEBUG) {
		debug("\npatience LCS:\n");
		for (i = 0; i < lcs_count; i++) {
			debug("\n L "); debug_dump_atom(left, right, lcs[i]);
			debug(" R "); debug_dump_atom(right, left,
				PATIENCE(lcs[i]).pos_in_other);
		}
	}

	/* TODO: For each common-unique line found (now listed in lcs), swallow
	 * lines upwards and downwards that are identical on each side.
	 * Requires a way to represent atoms being glued to adjacent atoms. */

	debug("\ntraverse LCS, possibly recursing:\n");

	/* Now we have pinned positions in both files at which it makes sense
	 * to divide the diff problem into smaller chunks. Go into the next
	 * round: look at each section in turn, trying to again find
	 * common-unique lines in those smaller sections. As soon as no more
	 * are found, the remaining smaller sections are solved by Myers. */
	/* left_pos and right_pos are indexes in left/right->atoms.head until
	 * which the atoms are already handled (added to result chunks). */
	unsigned int left_pos = 0;
	unsigned int right_pos = 0;
	for (i = 0; i <= lcs_count; i++) {
		struct diff_atom *atom;
		struct diff_atom *atom_r;
		/* left_idx and right_idx are indexes of the start of this
		 * section of identical lines on both sides.
		 * left_pos marks the index of the first still unhandled line,
		 * left_idx is the start of an identical section some way
		 * further down, and this loop adds an unsolved chunk of
		 * [left_pos..left_idx[ and a solved chunk of
		 * [left_idx..left_idx+1[ (just the matching atom; swallowing
		 * adjacent identical lines is still a TODO, see above). */
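		/* A made-up example, looking at the left side only: with LCS
		 * atoms at left indexes 3 and 7, iteration 0 emits a chunk
		 * for left[0..3[ and the solved chunk left[3..4[, iteration 1
		 * emits left[4..7[ and left[7..8[, and the final extra
		 * iteration (i == lcs_count) flushes whatever remains after
		 * index 7. */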
		unsigned int left_idx;
		unsigned int right_idx;

		debug("iteration %u of %u left_pos %u right_pos %u\n",
		      i, lcs_count, left_pos, right_pos);

		if (i < lcs_count) {
			atom = lcs[i];
			atom_r = PATIENCE(atom).pos_in_other;
			debug("lcs[%u] = left[%u] = right[%u]\n", i,
			      diff_atom_idx(left, atom),
			      diff_atom_idx(right, atom_r));
			left_idx = diff_atom_idx(left, atom);
			right_idx = diff_atom_idx(right, atom_r);
		} else {
			/* There are no more identical lines until the end of
			 * left and right. */
			atom = NULL;
			atom_r = NULL;
			left_idx = left->atoms.len;
			right_idx = right->atoms.len;
		}

		/* 'atom' (if not NULL) now marks an atom that matches on both
		 * sides according to patience-diff (a common-unique identical
		 * atom in both files).
		 * Handle the section before and the atom itself; the section
		 * after will be handled by the next loop iteration -- note
		 * that i loops to last element + 1 ("i <= lcs_count"), so that
		 * there will be another final iteration to pick up the last
		 * remaining items after the last LCS atom.
		 */

		debug("iteration %u left_pos %u left_idx %u"
		      " right_pos %u right_idx %u\n",
		      i, left_pos, left_idx, right_pos, right_idx);

		/* Section before the matching atom */
		struct diff_atom *left_atom = &left->atoms.head[left_pos];
		unsigned int left_section_len = left_idx - left_pos;

		struct diff_atom *right_atom = &(right->atoms.head[right_pos]);
		unsigned int right_section_len = right_idx - right_pos;

		if (left_section_len && right_section_len) {
			/* Record an unsolved chunk, the caller will apply
			 * inner_algo() on this chunk. */
			if (!diff_state_add_chunk(state, false,
						  left_atom, left_section_len,
						  right_atom,
						  right_section_len))
				goto free_and_exit;
		} else if (left_section_len && !right_section_len) {
			/* Only left atoms and none on the right, they form a
			 * "minus" chunk, then. */
			if (!diff_state_add_chunk(state, true,
						  left_atom, left_section_len,
						  right_atom, 0))
				goto free_and_exit;
		} else if (!left_section_len && right_section_len) {
			/* No left atoms, only atoms on the right, they form a
			 * "plus" chunk, then. */
			if (!diff_state_add_chunk(state, true,
						  left_atom, 0,
						  right_atom, right_section_len))
				goto free_and_exit;
		}
		/* else: left_section_len == 0 and right_section_len == 0, i.e.
		 * nothing here. */

		/* The atom found to match on both sides forms a chunk of
		 * equals on each side. In the very last iteration of this
		 * loop, there is no matching atom; we were just cleaning out
		 * the remaining lines. */
		if (atom) {
			void *ok;
			ok = diff_state_add_chunk(state, true,
						  atom, 1,
						  PATIENCE(atom).pos_in_other, 1);
			if (!ok)
				goto free_and_exit;
		}
		left_pos = left_idx + 1;
		right_pos = right_idx + 1;
		debug("end of iteration %u left_pos %u left_idx %u"
		      " right_pos %u right_idx %u\n",
		      i, left_pos, left_idx, right_pos, right_idx);
	}
	debug("** END %s\n", __func__);

	rc = DIFF_RC_OK;

free_and_exit:
	left->root->current = NULL;
	right->root->current = NULL;
	free(atom_patience_left);
	free(atom_patience_right);
	if (lcs)
		free(lcs);
	return rc;
}