1 /* Generic infrastructure to implement various diff algorithms (implementation). */ 2 /* 3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de> 4 * 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18 19 #include <sys/queue.h> 20 #include <ctype.h> 21 #include <errno.h> 22 #include <stdint.h> 23 #include <stdlib.h> 24 #include <stdbool.h> 25 #include <stdio.h> 26 #include <string.h> 27 #include <limits.h> 28 #include <unistd.h> 29 30 #include <assert.h> 31 32 #include <arraylist.h> 33 #include <diff_main.h> 34 35 #include "diff_internal.h" 36 #include "diff_debug.h" 37 38 inline enum diff_chunk_type 39 diff_chunk_type(const struct diff_chunk *chunk) 40 { 41 if (!chunk->left_count && !chunk->right_count) 42 return CHUNK_EMPTY; 43 if (!chunk->solved) 44 return CHUNK_ERROR; 45 if (!chunk->right_count) 46 return CHUNK_MINUS; 47 if (!chunk->left_count) 48 return CHUNK_PLUS; 49 if (chunk->left_count != chunk->right_count) 50 return CHUNK_ERROR; 51 return CHUNK_SAME; 52 } 53 54 static int 55 read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len) 56 { 57 ssize_t r; 58 59 if (fseeko(f, at_pos, SEEK_SET) == -1) 60 return errno; 61 r = fread(buf, sizeof(char), len, f); 62 if ((r == 0 || r < len) && ferror(f)) 63 return EIO; 64 if (r != len) 65 return EIO; 66 return 0; 67 } 68 69 static int 70 buf_cmp(const unsigned char *left, size_t left_len, 71 const unsigned char *right, size_t right_len, 72 bool ignore_whitespace) 73 { 74 int cmp; 75 76 if (ignore_whitespace) { 77 int il = 0, ir = 0; 78 while (il < left_len && ir < right_len) { 79 unsigned char cl = left[il]; 80 unsigned char cr = right[ir]; 81 82 if (isspace((unsigned char)cl) && il < left_len) { 83 il++; 84 continue; 85 } 86 if (isspace((unsigned char)cr) && ir < right_len) { 87 ir++; 88 continue; 89 } 90 91 if (cl > cr) 92 return 1; 93 if (cr > cl) 94 return -1; 95 il++; 96 ir++; 97 } 98 while (il < left_len) { 99 unsigned char cl = left[il++]; 100 if (!isspace((unsigned char)cl)) 101 return 1; 102 } 103 while (ir < right_len) { 104 unsigned char cr = right[ir++]; 105 if (!isspace((unsigned char)cr)) 106 return -1; 107 } 108 109 return 0; 110 } 111 112 cmp = memcmp(left, right, MIN(left_len, right_len)); 113 if (cmp) 114 return cmp; 115 if (left_len == right_len) 116 return 0; 117 return (left_len > right_len) ? 1 : -1; 118 } 119 120 int 121 diff_atom_cmp(int *cmp, 122 const struct diff_atom *left, 123 const struct diff_atom *right) 124 { 125 off_t remain_left, remain_right; 126 int flags = (left->root->diff_flags | right->root->diff_flags); 127 bool ignore_whitespace = (flags & DIFF_FLAG_IGNORE_WHITESPACE); 128 129 if (!left->len && !right->len) { 130 *cmp = 0; 131 return 0; 132 } 133 if (!ignore_whitespace) { 134 if (!right->len) { 135 *cmp = 1; 136 return 0; 137 } 138 if (!left->len) { 139 *cmp = -1; 140 return 0; 141 } 142 } 143 144 if (left->at != NULL && right->at != NULL) { 145 *cmp = buf_cmp(left->at, left->len, right->at, right->len, 146 ignore_whitespace); 147 return 0; 148 } 149 150 remain_left = left->len; 151 remain_right = right->len; 152 while (remain_left > 0 || remain_right > 0) { 153 const size_t chunksz = 8192; 154 unsigned char buf_left[chunksz], buf_right[chunksz]; 155 const uint8_t *p_left, *p_right; 156 off_t n_left, n_right; 157 int r; 158 159 if (!remain_right) { 160 *cmp = 1; 161 return 0; 162 } 163 if (!remain_left) { 164 *cmp = -1; 165 return 0; 166 } 167 168 n_left = MIN(chunksz, remain_left); 169 n_right = MIN(chunksz, remain_right); 170 171 if (left->at == NULL) { 172 r = read_at(left->root->f, 173 left->pos + (left->len - remain_left), 174 buf_left, n_left); 175 if (r) { 176 *cmp = 0; 177 return r; 178 } 179 p_left = buf_left; 180 } else { 181 p_left = left->at + (left->len - remain_left); 182 } 183 184 if (right->at == NULL) { 185 r = read_at(right->root->f, 186 right->pos + (right->len - remain_right), 187 buf_right, n_right); 188 if (r) { 189 *cmp = 0; 190 return r; 191 } 192 p_right = buf_right; 193 } else { 194 p_right = right->at + (right->len - remain_right); 195 } 196 197 r = buf_cmp(p_left, n_left, p_right, n_right, 198 ignore_whitespace); 199 if (r) { 200 *cmp = r; 201 return 0; 202 } 203 204 remain_left -= n_left; 205 remain_right -= n_right; 206 } 207 208 *cmp = 0; 209 return 0; 210 } 211 212 int 213 diff_atom_same(bool *same, 214 const struct diff_atom *left, 215 const struct diff_atom *right) 216 { 217 int cmp; 218 int r; 219 if (left->hash != right->hash) { 220 *same = false; 221 return 0; 222 } 223 r = diff_atom_cmp(&cmp, left, right); 224 if (r) { 225 *same = true; 226 return r; 227 } 228 *same = (cmp == 0); 229 return 0; 230 } 231 232 static struct diff_chunk * 233 diff_state_add_solved_chunk(struct diff_state *state, 234 const struct diff_chunk *chunk) 235 { 236 diff_chunk_arraylist_t *result; 237 struct diff_chunk *new_chunk; 238 enum diff_chunk_type last_t; 239 enum diff_chunk_type new_t; 240 struct diff_chunk *last; 241 242 /* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk 243 * never directly follows a plus chunk. */ 244 result = &state->result->chunks; 245 246 last_t = result->len ? diff_chunk_type(&result->head[result->len - 1]) 247 : CHUNK_EMPTY; 248 new_t = diff_chunk_type(chunk); 249 250 debug("ADD %s chunk #%u:\n", chunk->solved ? "solved" : "UNSOLVED", 251 result->len); 252 debug("L\n"); 253 debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count); 254 debug("R\n"); 255 debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count); 256 257 if (result->len) { 258 last = &result->head[result->len - 1]; 259 assert(chunk->left_start 260 == last->left_start + last->left_count); 261 assert(chunk->right_start 262 == last->right_start + last->right_count); 263 } 264 265 if (new_t == last_t) { 266 new_chunk = &result->head[result->len - 1]; 267 new_chunk->left_count += chunk->left_count; 268 new_chunk->right_count += chunk->right_count; 269 debug(" - added chunk touches previous one of same type, joined:\n"); 270 debug("L\n"); 271 debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count); 272 debug("R\n"); 273 debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count); 274 } else if (last_t == CHUNK_PLUS && new_t == CHUNK_MINUS) { 275 enum diff_chunk_type prev_last_t = 276 result->len > 1 ? 277 diff_chunk_type(&result->head[result->len - 2]) 278 : CHUNK_EMPTY; 279 /* If a minus-chunk follows a plus-chunk, place it above the plus-chunk-> 280 * Is the one before that also a minus? combine. */ 281 if (prev_last_t == CHUNK_MINUS) { 282 new_chunk = &result->head[result->len - 2]; 283 new_chunk->left_count += chunk->left_count; 284 new_chunk->right_count += chunk->right_count; 285 286 debug(" - added minus-chunk follows plus-chunk," 287 " put before that plus-chunk and joined" 288 " with preceding minus-chunk:\n"); 289 debug("L\n"); 290 debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count); 291 debug("R\n"); 292 debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count); 293 } else { 294 ARRAYLIST_INSERT(new_chunk, *result, result->len - 1); 295 if (!new_chunk) 296 return NULL; 297 *new_chunk = *chunk; 298 299 /* The new minus chunk indicates to which position on 300 * the right it corresponds, even though it doesn't add 301 * any lines on the right. By moving above a plus chunk, 302 * that position on the right has shifted. */ 303 last = &result->head[result->len - 1]; 304 new_chunk->right_start = last->right_start; 305 306 debug(" - added minus-chunk follows plus-chunk," 307 " put before that plus-chunk\n"); 308 } 309 310 /* That last_t == CHUNK_PLUS indicates to which position on the 311 * left it corresponds, even though it doesn't add any lines on 312 * the left. By inserting/extending the prev_last_t == 313 * CHUNK_MINUS, that position on the left has shifted. */ 314 last = &result->head[result->len - 1]; 315 last->left_start = new_chunk->left_start 316 + new_chunk->left_count; 317 318 } else { 319 ARRAYLIST_ADD(new_chunk, *result); 320 if (!new_chunk) 321 return NULL; 322 *new_chunk = *chunk; 323 } 324 return new_chunk; 325 } 326 327 /* Even if a left or right side is empty, diff output may need to know the 328 * position in that file. 329 * So left_start or right_start must never be NULL -- pass left_count or 330 * right_count as zero to indicate staying at that position without consuming 331 * any lines. */ 332 struct diff_chunk * 333 diff_state_add_chunk(struct diff_state *state, bool solved, 334 struct diff_atom *left_start, unsigned int left_count, 335 struct diff_atom *right_start, unsigned int right_count) 336 { 337 struct diff_chunk *new_chunk; 338 struct diff_chunk chunk = { 339 .solved = solved, 340 .left_start = left_start, 341 .left_count = left_count, 342 .right_start = right_start, 343 .right_count = right_count, 344 }; 345 346 /* An unsolved chunk means store as intermediate result for later 347 * re-iteration. 348 * If there already are intermediate results, that means even a 349 * following solved chunk needs to go to intermediate results, so that 350 * it is later put in the final correct position in solved chunks. 351 */ 352 if (!solved || state->temp_result.len) { 353 /* Append to temp_result */ 354 debug("ADD %s chunk to temp result:\n", 355 chunk.solved ? "solved" : "UNSOLVED"); 356 debug("L\n"); 357 debug_dump_atoms(&state->left, left_start, left_count); 358 debug("R\n"); 359 debug_dump_atoms(&state->right, right_start, right_count); 360 ARRAYLIST_ADD(new_chunk, state->temp_result); 361 if (!new_chunk) 362 return NULL; 363 *new_chunk = chunk; 364 return new_chunk; 365 } 366 367 return diff_state_add_solved_chunk(state, &chunk); 368 } 369 370 static void 371 diff_data_init_root(struct diff_data *d, FILE *f, const uint8_t *data, 372 unsigned long long len, int diff_flags) 373 { 374 *d = (struct diff_data){ 375 .f = f, 376 .pos = 0, 377 .data = data, 378 .len = len, 379 .root = d, 380 .diff_flags = diff_flags, 381 }; 382 } 383 384 void 385 diff_data_init_subsection(struct diff_data *d, struct diff_data *parent, 386 struct diff_atom *from_atom, unsigned int atoms_count) 387 { 388 struct diff_atom *last_atom; 389 390 debug("diff_data %p parent %p from_atom %p atoms_count %u\n", 391 d, parent, from_atom, atoms_count); 392 debug(" from_atom "); 393 debug_dump_atom(parent, NULL, from_atom); 394 395 if (atoms_count == 0) { 396 *d = (struct diff_data){ 397 .f = NULL, 398 .pos = 0, 399 .data = NULL, 400 .len = 0, 401 .root = parent->root, 402 .atoms.head = NULL, 403 .atoms.len = atoms_count, 404 }; 405 406 return; 407 } 408 409 last_atom = from_atom + atoms_count - 1; 410 *d = (struct diff_data){ 411 .f = NULL, 412 .pos = from_atom->pos, 413 .data = from_atom->at, 414 .len = (last_atom->pos + last_atom->len) - from_atom->pos, 415 .root = parent->root, 416 .atoms.head = from_atom, 417 .atoms.len = atoms_count, 418 }; 419 420 debug("subsection:\n"); 421 debug_dump(d); 422 } 423 424 void 425 diff_data_free(struct diff_data *diff_data) 426 { 427 if (!diff_data) 428 return; 429 if (diff_data->atoms.allocated) 430 ARRAYLIST_FREE(diff_data->atoms); 431 } 432 433 int 434 diff_algo_none(const struct diff_algo_config *algo_config, 435 struct diff_state *state) 436 { 437 debug("\n** %s\n", __func__); 438 debug("left:\n"); 439 debug_dump(&state->left); 440 debug("right:\n"); 441 debug_dump(&state->right); 442 debug_dump_myers_graph(&state->left, &state->right, NULL, NULL, 0, NULL, 443 0); 444 445 /* Add a chunk of equal lines, if any */ 446 struct diff_atom *l = state->left.atoms.head; 447 unsigned int l_len = state->left.atoms.len; 448 struct diff_atom *r = state->right.atoms.head; 449 unsigned int r_len = state->right.atoms.len; 450 unsigned int equal_atoms_start = 0; 451 unsigned int equal_atoms_end = 0; 452 unsigned int l_idx = 0; 453 unsigned int r_idx = 0; 454 455 while (equal_atoms_start < l_len 456 && equal_atoms_start < r_len) { 457 int err; 458 bool same; 459 err = diff_atom_same(&same, &l[equal_atoms_start], 460 &r[equal_atoms_start]); 461 if (err) 462 return err; 463 if (!same) 464 break; 465 equal_atoms_start++; 466 } 467 while (equal_atoms_end < (l_len - equal_atoms_start) 468 && equal_atoms_end < (r_len - equal_atoms_start)) { 469 int err; 470 bool same; 471 err = diff_atom_same(&same, &l[l_len - 1 - equal_atoms_end], 472 &r[r_len - 1 - equal_atoms_end]); 473 if (err) 474 return err; 475 if (!same) 476 break; 477 equal_atoms_end++; 478 } 479 480 /* Add a chunk of equal lines at the start */ 481 if (equal_atoms_start) { 482 if (!diff_state_add_chunk(state, true, 483 l, equal_atoms_start, 484 r, equal_atoms_start)) 485 return ENOMEM; 486 l_idx += equal_atoms_start; 487 r_idx += equal_atoms_start; 488 } 489 490 /* Add a "minus" chunk with all lines from the left. */ 491 if (equal_atoms_start + equal_atoms_end < l_len) { 492 unsigned int add_len = l_len - equal_atoms_start - equal_atoms_end; 493 if (!diff_state_add_chunk(state, true, 494 &l[l_idx], add_len, 495 &r[r_idx], 0)) 496 return ENOMEM; 497 l_idx += add_len; 498 } 499 500 /* Add a "plus" chunk with all lines from the right. */ 501 if (equal_atoms_start + equal_atoms_end < r_len) { 502 unsigned int add_len = r_len - equal_atoms_start - equal_atoms_end; 503 if (!diff_state_add_chunk(state, true, 504 &l[l_idx], 0, 505 &r[r_idx], add_len)) 506 return ENOMEM; 507 r_idx += add_len; 508 } 509 510 /* Add a chunk of equal lines at the end */ 511 if (equal_atoms_end) { 512 if (!diff_state_add_chunk(state, true, 513 &l[l_idx], equal_atoms_end, 514 &r[r_idx], equal_atoms_end)) 515 return ENOMEM; 516 } 517 518 return DIFF_RC_OK; 519 } 520 521 static int 522 diff_run_algo(const struct diff_algo_config *algo_config, 523 struct diff_state *state) 524 { 525 int rc; 526 527 if (!algo_config || !algo_config->impl 528 || !state->recursion_depth_left 529 || !state->left.atoms.len || !state->right.atoms.len) { 530 debug("Fall back to diff_algo_none():%s%s%s\n", 531 (!algo_config || !algo_config->impl) ? " no-cfg" : "", 532 (!state->recursion_depth_left) ? " max-depth" : "", 533 (!state->left.atoms.len || !state->right.atoms.len)? 534 " trivial" : ""); 535 return diff_algo_none(algo_config, state); 536 } 537 538 ARRAYLIST_FREE(state->temp_result); 539 ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE); 540 rc = algo_config->impl(algo_config, state); 541 switch (rc) { 542 case DIFF_RC_USE_DIFF_ALGO_FALLBACK: 543 debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n", 544 algo_config->fallback_algo); 545 rc = diff_run_algo(algo_config->fallback_algo, state); 546 goto return_rc; 547 548 case DIFF_RC_OK: 549 /* continue below */ 550 break; 551 552 default: 553 /* some error happened */ 554 goto return_rc; 555 } 556 557 /* Pick up any diff chunks that are still unsolved and feed to 558 * inner_algo. inner_algo will solve unsolved chunks and append to 559 * result, and subsequent solved chunks on this level are then appended 560 * to result afterwards. */ 561 int i; 562 for (i = 0; i < state->temp_result.len; i++) { 563 struct diff_chunk *c = &state->temp_result.head[i]; 564 if (c->solved) { 565 diff_state_add_solved_chunk(state, c); 566 continue; 567 } 568 569 /* c is an unsolved chunk, feed to inner_algo */ 570 struct diff_state inner_state = { 571 .result = state->result, 572 .recursion_depth_left = state->recursion_depth_left - 1, 573 .kd_buf = state->kd_buf, 574 .kd_buf_size = state->kd_buf_size, 575 }; 576 diff_data_init_subsection(&inner_state.left, &state->left, 577 c->left_start, c->left_count); 578 diff_data_init_subsection(&inner_state.right, &state->right, 579 c->right_start, c->right_count); 580 581 rc = diff_run_algo(algo_config->inner_algo, &inner_state); 582 state->kd_buf = inner_state.kd_buf; 583 state->kd_buf_size = inner_state.kd_buf_size; 584 if (rc != DIFF_RC_OK) 585 goto return_rc; 586 } 587 588 rc = DIFF_RC_OK; 589 return_rc: 590 ARRAYLIST_FREE(state->temp_result); 591 return rc; 592 } 593 594 int 595 diff_atomize_file(struct diff_data *d, 596 const struct diff_config *config, 597 FILE *f, const uint8_t *data, off_t len, int diff_flags) 598 { 599 if (!config->atomize_func) 600 return EINVAL; 601 602 diff_data_init_root(d, f, data, len, diff_flags); 603 604 return config->atomize_func(config->atomize_func_data, d); 605 606 } 607 608 struct diff_result * 609 diff_main(const struct diff_config *config, struct diff_data *left, 610 struct diff_data *right) 611 { 612 struct diff_result *result = malloc(sizeof(struct diff_result)); 613 if (!result) 614 return NULL; 615 616 *result = (struct diff_result){}; 617 result->left = left; 618 result->right = right; 619 620 struct diff_state state = { 621 .result = result, 622 .recursion_depth_left = config->max_recursion_depth ? 623 config->max_recursion_depth : UINT_MAX, 624 .kd_buf = NULL, 625 .kd_buf_size = 0, 626 }; 627 diff_data_init_subsection(&state.left, left, 628 left->atoms.head, 629 left->atoms.len); 630 diff_data_init_subsection(&state.right, right, 631 right->atoms.head, 632 right->atoms.len); 633 634 result->rc = diff_run_algo(config->algo, &state); 635 free(state.kd_buf); 636 637 return result; 638 } 639 640 void 641 diff_result_free(struct diff_result *result) 642 { 643 if (!result) 644 return; 645 ARRAYLIST_FREE(result->chunks); 646 free(result); 647 } 648 649 int 650 diff_result_contains_printable_chunks(struct diff_result *result) 651 { 652 struct diff_chunk *c; 653 enum diff_chunk_type t; 654 int i; 655 656 for (i = 0; i < result->chunks.len; i++) { 657 c = &result->chunks.head[i]; 658 t = diff_chunk_type(c); 659 if (t == CHUNK_MINUS || t == CHUNK_PLUS) 660 return 1; 661 } 662 663 return 0; 664 } 665