1 /* Generic infrastructure to implement various diff algorithms (implementation). */ 2 /* 3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de> 4 * 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18 19 #include <sys/queue.h> 20 #include <ctype.h> 21 #include <errno.h> 22 #include <stdint.h> 23 #include <stdlib.h> 24 #include <stdbool.h> 25 #include <stdio.h> 26 #include <string.h> 27 #include <limits.h> 28 #include <unistd.h> 29 30 #include <assert.h> 31 32 #include <arraylist.h> 33 #include <diff_main.h> 34 35 #include "diff_internal.h" 36 #include "diff_debug.h" 37 38 inline enum diff_chunk_type 39 diff_chunk_type(const struct diff_chunk *chunk) 40 { 41 if (!chunk->left_count && !chunk->right_count) 42 return CHUNK_EMPTY; 43 if (!chunk->solved) 44 return CHUNK_ERROR; 45 if (!chunk->right_count) 46 return CHUNK_MINUS; 47 if (!chunk->left_count) 48 return CHUNK_PLUS; 49 if (chunk->left_count != chunk->right_count) 50 return CHUNK_ERROR; 51 return CHUNK_SAME; 52 } 53 54 static int 55 read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len) 56 { 57 int r; 58 if (fseeko(f, at_pos, SEEK_SET) == -1) 59 return errno; 60 r = fread(buf, sizeof(char), len, f); 61 if ((r == 0 || r < len) && ferror(f)) 62 return EIO; 63 if (r != len) 64 return EIO; 65 return 0; 66 } 67 68 static int 69 buf_cmp(const unsigned char *left, size_t left_len, 70 const unsigned char *right, size_t right_len, 71 bool ignore_whitespace) 72 { 73 int cmp; 74 75 if (ignore_whitespace) { 76 int il = 0, ir = 0; 77 while (il < left_len && ir < right_len) { 78 unsigned char cl = left[il]; 79 unsigned char cr = right[ir]; 80 81 if (isspace((unsigned char)cl) && il < left_len) { 82 il++; 83 continue; 84 } 85 if (isspace((unsigned char)cr) && ir < right_len) { 86 ir++; 87 continue; 88 } 89 90 if (cl > cr) 91 return 1; 92 if (cr > cl) 93 return -1; 94 il++; 95 ir++; 96 } 97 while (il < left_len) { 98 unsigned char cl = left[il++]; 99 if (!isspace((unsigned char)cl)) 100 return 1; 101 } 102 while (ir < right_len) { 103 unsigned char cr = right[ir++]; 104 if (!isspace((unsigned char)cr)) 105 return -1; 106 } 107 108 return 0; 109 } 110 111 cmp = memcmp(left, right, MIN(left_len, right_len)); 112 if (cmp) 113 return cmp; 114 if (left_len == right_len) 115 return 0; 116 return (left_len > right_len) ? 1 : -1; 117 } 118 119 int 120 diff_atom_cmp(int *cmp, 121 const struct diff_atom *left, 122 const struct diff_atom *right) 123 { 124 off_t remain_left, remain_right; 125 int flags = (left->root->diff_flags | right->root->diff_flags); 126 bool ignore_whitespace = (flags & DIFF_FLAG_IGNORE_WHITESPACE); 127 128 if (!left->len && !right->len) { 129 *cmp = 0; 130 return 0; 131 } 132 if (!ignore_whitespace) { 133 if (!right->len) { 134 *cmp = 1; 135 return 0; 136 } 137 if (!left->len) { 138 *cmp = -1; 139 return 0; 140 } 141 } 142 143 if (left->at != NULL && right->at != NULL) { 144 *cmp = buf_cmp(left->at, left->len, right->at, right->len, 145 ignore_whitespace); 146 return 0; 147 } 148 149 remain_left = left->len; 150 remain_right = right->len; 151 while (remain_left > 0 || remain_right > 0) { 152 const size_t chunksz = 8192; 153 unsigned char buf_left[chunksz], buf_right[chunksz]; 154 const uint8_t *p_left, *p_right; 155 off_t n_left, n_right; 156 ssize_t r; 157 158 if (!remain_right) { 159 *cmp = 1; 160 return 0; 161 } 162 if (!remain_left) { 163 *cmp = -1; 164 return 0; 165 } 166 167 n_left = MIN(chunksz, remain_left); 168 n_right = MIN(chunksz, remain_right); 169 170 if (left->at == NULL) { 171 r = read_at(left->root->f, 172 left->pos + (left->len - remain_left), 173 buf_left, n_left); 174 if (r) { 175 *cmp = 0; 176 return r; 177 } 178 p_left = buf_left; 179 } else { 180 p_left = left->at + (left->len - remain_left); 181 } 182 183 if (right->at == NULL) { 184 r = read_at(right->root->f, 185 right->pos + (right->len - remain_right), 186 buf_right, n_right); 187 if (r) { 188 *cmp = 0; 189 return r; 190 } 191 p_right = buf_right; 192 } else { 193 p_right = right->at + (right->len - remain_right); 194 } 195 196 r = buf_cmp(p_left, n_left, p_right, n_right, 197 ignore_whitespace); 198 if (r) { 199 *cmp = r; 200 return 0; 201 } 202 203 remain_left -= n_left; 204 remain_right -= n_right; 205 } 206 207 *cmp = 0; 208 return 0; 209 } 210 211 int 212 diff_atom_same(bool *same, 213 const struct diff_atom *left, 214 const struct diff_atom *right) 215 { 216 int cmp; 217 int r; 218 if (left->hash != right->hash) { 219 *same = false; 220 return 0; 221 } 222 r = diff_atom_cmp(&cmp, left, right); 223 if (r) { 224 *same = true; 225 return r; 226 } 227 *same = (cmp == 0); 228 return 0; 229 } 230 231 static struct diff_chunk * 232 diff_state_add_solved_chunk(struct diff_state *state, 233 const struct diff_chunk *chunk) 234 { 235 diff_chunk_arraylist_t *result; 236 struct diff_chunk *new_chunk; 237 enum diff_chunk_type last_t; 238 enum diff_chunk_type new_t; 239 struct diff_chunk *last; 240 241 /* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk 242 * never directly follows a plus chunk. */ 243 result = &state->result->chunks; 244 245 last_t = result->len ? diff_chunk_type(&result->head[result->len - 1]) 246 : CHUNK_EMPTY; 247 new_t = diff_chunk_type(chunk); 248 249 debug("ADD %s chunk #%u:\n", chunk->solved ? "solved" : "UNSOLVED", 250 result->len); 251 debug("L\n"); 252 debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count); 253 debug("R\n"); 254 debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count); 255 256 if (result->len) { 257 last = &result->head[result->len - 1]; 258 assert(chunk->left_start 259 == last->left_start + last->left_count); 260 assert(chunk->right_start 261 == last->right_start + last->right_count); 262 } 263 264 if (new_t == last_t) { 265 new_chunk = &result->head[result->len - 1]; 266 new_chunk->left_count += chunk->left_count; 267 new_chunk->right_count += chunk->right_count; 268 debug(" - added chunk touches previous one of same type, joined:\n"); 269 debug("L\n"); 270 debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count); 271 debug("R\n"); 272 debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count); 273 } else if (last_t == CHUNK_PLUS && new_t == CHUNK_MINUS) { 274 enum diff_chunk_type prev_last_t = 275 result->len > 1 ? 276 diff_chunk_type(&result->head[result->len - 2]) 277 : CHUNK_EMPTY; 278 /* If a minus-chunk follows a plus-chunk, place it above the plus-chunk-> 279 * Is the one before that also a minus? combine. */ 280 if (prev_last_t == CHUNK_MINUS) { 281 new_chunk = &result->head[result->len - 2]; 282 new_chunk->left_count += chunk->left_count; 283 new_chunk->right_count += chunk->right_count; 284 285 debug(" - added minus-chunk follows plus-chunk," 286 " put before that plus-chunk and joined" 287 " with preceding minus-chunk:\n"); 288 debug("L\n"); 289 debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count); 290 debug("R\n"); 291 debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count); 292 } else { 293 ARRAYLIST_INSERT(new_chunk, *result, result->len - 1); 294 if (!new_chunk) 295 return NULL; 296 *new_chunk = *chunk; 297 298 /* The new minus chunk indicates to which position on 299 * the right it corresponds, even though it doesn't add 300 * any lines on the right. By moving above a plus chunk, 301 * that position on the right has shifted. */ 302 last = &result->head[result->len - 1]; 303 new_chunk->right_start = last->right_start; 304 305 debug(" - added minus-chunk follows plus-chunk," 306 " put before that plus-chunk\n"); 307 } 308 309 /* That last_t == CHUNK_PLUS indicates to which position on the 310 * left it corresponds, even though it doesn't add any lines on 311 * the left. By inserting/extending the prev_last_t == 312 * CHUNK_MINUS, that position on the left has shifted. */ 313 last = &result->head[result->len - 1]; 314 last->left_start = new_chunk->left_start 315 + new_chunk->left_count; 316 317 } else { 318 ARRAYLIST_ADD(new_chunk, *result); 319 if (!new_chunk) 320 return NULL; 321 *new_chunk = *chunk; 322 } 323 return new_chunk; 324 } 325 326 /* Even if a left or right side is empty, diff output may need to know the 327 * position in that file. 328 * So left_start or right_start must never be NULL -- pass left_count or 329 * right_count as zero to indicate staying at that position without consuming 330 * any lines. */ 331 struct diff_chunk * 332 diff_state_add_chunk(struct diff_state *state, bool solved, 333 struct diff_atom *left_start, unsigned int left_count, 334 struct diff_atom *right_start, unsigned int right_count) 335 { 336 struct diff_chunk *new_chunk; 337 struct diff_chunk chunk = { 338 .solved = solved, 339 .left_start = left_start, 340 .left_count = left_count, 341 .right_start = right_start, 342 .right_count = right_count, 343 }; 344 345 /* An unsolved chunk means store as intermediate result for later 346 * re-iteration. 347 * If there already are intermediate results, that means even a 348 * following solved chunk needs to go to intermediate results, so that 349 * it is later put in the final correct position in solved chunks. 350 */ 351 if (!solved || state->temp_result.len) { 352 /* Append to temp_result */ 353 debug("ADD %s chunk to temp result:\n", 354 chunk.solved ? "solved" : "UNSOLVED"); 355 debug("L\n"); 356 debug_dump_atoms(&state->left, left_start, left_count); 357 debug("R\n"); 358 debug_dump_atoms(&state->right, right_start, right_count); 359 ARRAYLIST_ADD(new_chunk, state->temp_result); 360 if (!new_chunk) 361 return NULL; 362 *new_chunk = chunk; 363 return new_chunk; 364 } 365 366 return diff_state_add_solved_chunk(state, &chunk); 367 } 368 369 static void 370 diff_data_init_root(struct diff_data *d, FILE *f, const uint8_t *data, 371 unsigned long long len, int diff_flags) 372 { 373 *d = (struct diff_data){ 374 .f = f, 375 .pos = 0, 376 .data = data, 377 .len = len, 378 .root = d, 379 .diff_flags = diff_flags, 380 }; 381 } 382 383 void 384 diff_data_init_subsection(struct diff_data *d, struct diff_data *parent, 385 struct diff_atom *from_atom, unsigned int atoms_count) 386 { 387 struct diff_atom *last_atom; 388 389 debug("diff_data %p parent %p from_atom %p atoms_count %u\n", 390 d, parent, from_atom, atoms_count); 391 debug(" from_atom "); 392 debug_dump_atom(parent, NULL, from_atom); 393 394 if (atoms_count == 0) { 395 *d = (struct diff_data){ 396 .f = NULL, 397 .pos = 0, 398 .data = NULL, 399 .len = 0, 400 .root = parent->root, 401 .atoms.head = NULL, 402 .atoms.len = atoms_count, 403 }; 404 405 return; 406 } 407 408 last_atom = from_atom + atoms_count - 1; 409 *d = (struct diff_data){ 410 .f = NULL, 411 .pos = from_atom->pos, 412 .data = from_atom->at, 413 .len = (last_atom->pos + last_atom->len) - from_atom->pos, 414 .root = parent->root, 415 .atoms.head = from_atom, 416 .atoms.len = atoms_count, 417 }; 418 419 debug("subsection:\n"); 420 debug_dump(d); 421 } 422 423 void 424 diff_data_free(struct diff_data *diff_data) 425 { 426 if (!diff_data) 427 return; 428 if (diff_data->atoms.allocated) 429 ARRAYLIST_FREE(diff_data->atoms); 430 } 431 432 int 433 diff_algo_none(const struct diff_algo_config *algo_config, 434 struct diff_state *state) 435 { 436 debug("\n** %s\n", __func__); 437 debug("left:\n"); 438 debug_dump(&state->left); 439 debug("right:\n"); 440 debug_dump(&state->right); 441 debug_dump_myers_graph(&state->left, &state->right, NULL, NULL, 0, NULL, 442 0); 443 444 /* Add a chunk of equal lines, if any */ 445 struct diff_atom *l = state->left.atoms.head; 446 unsigned int l_len = state->left.atoms.len; 447 struct diff_atom *r = state->right.atoms.head; 448 unsigned int r_len = state->right.atoms.len; 449 unsigned int equal_atoms_start = 0; 450 unsigned int equal_atoms_end = 0; 451 unsigned int l_idx = 0; 452 unsigned int r_idx = 0; 453 454 while (equal_atoms_start < l_len 455 && equal_atoms_start < r_len) { 456 int err; 457 bool same; 458 err = diff_atom_same(&same, &l[equal_atoms_start], 459 &r[equal_atoms_start]); 460 if (err) 461 return err; 462 if (!same) 463 break; 464 equal_atoms_start++; 465 } 466 while (equal_atoms_end < (l_len - equal_atoms_start) 467 && equal_atoms_end < (r_len - equal_atoms_start)) { 468 int err; 469 bool same; 470 err = diff_atom_same(&same, &l[l_len - 1 - equal_atoms_end], 471 &r[r_len - 1 - equal_atoms_end]); 472 if (err) 473 return err; 474 if (!same) 475 break; 476 equal_atoms_end++; 477 } 478 479 /* Add a chunk of equal lines at the start */ 480 if (equal_atoms_start) { 481 if (!diff_state_add_chunk(state, true, 482 l, equal_atoms_start, 483 r, equal_atoms_start)) 484 return ENOMEM; 485 l_idx += equal_atoms_start; 486 r_idx += equal_atoms_start; 487 } 488 489 /* Add a "minus" chunk with all lines from the left. */ 490 if (equal_atoms_start + equal_atoms_end < l_len) { 491 unsigned int add_len = l_len - equal_atoms_start - equal_atoms_end; 492 if (!diff_state_add_chunk(state, true, 493 &l[l_idx], add_len, 494 &r[r_idx], 0)) 495 return ENOMEM; 496 l_idx += add_len; 497 } 498 499 /* Add a "plus" chunk with all lines from the right. */ 500 if (equal_atoms_start + equal_atoms_end < r_len) { 501 unsigned int add_len = r_len - equal_atoms_start - equal_atoms_end; 502 if (!diff_state_add_chunk(state, true, 503 &l[l_idx], 0, 504 &r[r_idx], add_len)) 505 return ENOMEM; 506 r_idx += add_len; 507 } 508 509 /* Add a chunk of equal lines at the end */ 510 if (equal_atoms_end) { 511 if (!diff_state_add_chunk(state, true, 512 &l[l_idx], equal_atoms_end, 513 &r[r_idx], equal_atoms_end)) 514 return ENOMEM; 515 } 516 517 return DIFF_RC_OK; 518 } 519 520 static int 521 diff_run_algo(const struct diff_algo_config *algo_config, 522 struct diff_state *state) 523 { 524 int rc; 525 526 if (!algo_config || !algo_config->impl 527 || !state->recursion_depth_left 528 || !state->left.atoms.len || !state->right.atoms.len) { 529 debug("Fall back to diff_algo_none():%s%s%s\n", 530 (!algo_config || !algo_config->impl) ? " no-cfg" : "", 531 (!state->recursion_depth_left) ? " max-depth" : "", 532 (!state->left.atoms.len || !state->right.atoms.len)? 533 " trivial" : ""); 534 return diff_algo_none(algo_config, state); 535 } 536 537 ARRAYLIST_FREE(state->temp_result); 538 ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE); 539 rc = algo_config->impl(algo_config, state); 540 switch (rc) { 541 case DIFF_RC_USE_DIFF_ALGO_FALLBACK: 542 debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n", 543 algo_config->fallback_algo); 544 rc = diff_run_algo(algo_config->fallback_algo, state); 545 goto return_rc; 546 547 case DIFF_RC_OK: 548 /* continue below */ 549 break; 550 551 default: 552 /* some error happened */ 553 goto return_rc; 554 } 555 556 /* Pick up any diff chunks that are still unsolved and feed to 557 * inner_algo. inner_algo will solve unsolved chunks and append to 558 * result, and subsequent solved chunks on this level are then appended 559 * to result afterwards. */ 560 int i; 561 for (i = 0; i < state->temp_result.len; i++) { 562 struct diff_chunk *c = &state->temp_result.head[i]; 563 if (c->solved) { 564 diff_state_add_solved_chunk(state, c); 565 continue; 566 } 567 568 /* c is an unsolved chunk, feed to inner_algo */ 569 struct diff_state inner_state = { 570 .result = state->result, 571 .recursion_depth_left = state->recursion_depth_left - 1, 572 .kd_buf = state->kd_buf, 573 .kd_buf_size = state->kd_buf_size, 574 }; 575 diff_data_init_subsection(&inner_state.left, &state->left, 576 c->left_start, c->left_count); 577 diff_data_init_subsection(&inner_state.right, &state->right, 578 c->right_start, c->right_count); 579 580 rc = diff_run_algo(algo_config->inner_algo, &inner_state); 581 state->kd_buf = inner_state.kd_buf; 582 state->kd_buf_size = inner_state.kd_buf_size; 583 if (rc != DIFF_RC_OK) 584 goto return_rc; 585 } 586 587 rc = DIFF_RC_OK; 588 return_rc: 589 ARRAYLIST_FREE(state->temp_result); 590 return rc; 591 } 592 593 int 594 diff_atomize_file(struct diff_data *d, 595 const struct diff_config *config, 596 FILE *f, const uint8_t *data, off_t len, int diff_flags) 597 { 598 if (!config->atomize_func) 599 return EINVAL; 600 601 diff_data_init_root(d, f, data, len, diff_flags); 602 603 return config->atomize_func(config->atomize_func_data, d); 604 605 } 606 607 struct diff_result * 608 diff_main(const struct diff_config *config, struct diff_data *left, 609 struct diff_data *right) 610 { 611 struct diff_result *result = malloc(sizeof(struct diff_result)); 612 if (!result) 613 return NULL; 614 615 *result = (struct diff_result){}; 616 result->left = left; 617 result->right = right; 618 619 struct diff_state state = { 620 .result = result, 621 .recursion_depth_left = config->max_recursion_depth ? 622 config->max_recursion_depth : UINT_MAX, 623 .kd_buf = NULL, 624 .kd_buf_size = 0, 625 }; 626 diff_data_init_subsection(&state.left, left, 627 left->atoms.head, 628 left->atoms.len); 629 diff_data_init_subsection(&state.right, right, 630 right->atoms.head, 631 right->atoms.len); 632 633 result->rc = diff_run_algo(config->algo, &state); 634 free(state.kd_buf); 635 636 return result; 637 } 638 639 void 640 diff_result_free(struct diff_result *result) 641 { 642 if (!result) 643 return; 644 ARRAYLIST_FREE(result->chunks); 645 free(result); 646 } 647 648 int 649 diff_result_contains_printable_chunks(struct diff_result *result) 650 { 651 struct diff_chunk *c; 652 enum diff_chunk_type t; 653 int i; 654 655 for (i = 0; i < result->chunks.len; i++) { 656 c = &result->chunks.head[i]; 657 t = diff_chunk_type(c); 658 if (t == CHUNK_MINUS || t == CHUNK_PLUS) 659 return 1; 660 } 661 662 return 0; 663 } 664