xref: /freebsd/contrib/libdiff/lib/diff_main.c (revision 357378bbdedf24ce2b90e9bd831af4a9db3ec70a)
1 /* Generic infrastructure to implement various diff algorithms (implementation). */
2 /*
3  * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 
19 #include <sys/queue.h>
20 #include <ctype.h>
21 #include <errno.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24 #include <stdbool.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <limits.h>
28 #include <unistd.h>
29 
30 #include <assert.h>
31 
32 #include <arraylist.h>
33 #include <diff_main.h>
34 
35 #include "diff_internal.h"
36 #include "diff_debug.h"
37 
38 inline enum diff_chunk_type
39 diff_chunk_type(const struct diff_chunk *chunk)
40 {
41 	if (!chunk->left_count && !chunk->right_count)
42 		return CHUNK_EMPTY;
43 	if (!chunk->solved)
44 		return CHUNK_ERROR;
45 	if (!chunk->right_count)
46 		return CHUNK_MINUS;
47 	if (!chunk->left_count)
48 		return CHUNK_PLUS;
49 	if (chunk->left_count != chunk->right_count)
50 		return CHUNK_ERROR;
51 	return CHUNK_SAME;
52 }
53 
54 static int
55 read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len)
56 {
57 	ssize_t r;
58 
59 	if (fseeko(f, at_pos, SEEK_SET) == -1)
60 		return errno;
61 	r = fread(buf, sizeof(char), len, f);
62 	if ((r == 0 || r < len) && ferror(f))
63 		return EIO;
64 	if (r != len)
65 		return EIO;
66 	return 0;
67 }
68 
69 static int
70 buf_cmp(const unsigned char *left, size_t left_len,
71 	const unsigned char *right, size_t right_len,
72 	bool ignore_whitespace)
73 {
74 	int cmp;
75 
76 	if (ignore_whitespace) {
77 		int il = 0, ir = 0;
78 		while (il < left_len && ir < right_len) {
79 			unsigned char cl = left[il];
80 			unsigned char cr = right[ir];
81 
82 			if (isspace((unsigned char)cl) && il < left_len) {
83 				il++;
84 				continue;
85 			}
86 			if (isspace((unsigned char)cr) && ir < right_len) {
87 				ir++;
88 				continue;
89 			}
90 
91 			if (cl > cr)
92 				return 1;
93 			if (cr > cl)
94 				return -1;
95 			il++;
96 			ir++;
97 		}
98 		while (il < left_len) {
99 			unsigned char cl = left[il++];
100 			if (!isspace((unsigned char)cl))
101 				return 1;
102 		}
103 		while (ir < right_len) {
104 			unsigned char cr = right[ir++];
105 			if (!isspace((unsigned char)cr))
106 				return -1;
107 		}
108 
109 		return 0;
110 	}
111 
112 	cmp = memcmp(left, right, MIN(left_len, right_len));
113 	if (cmp)
114 		return cmp;
115 	if (left_len == right_len)
116 		return 0;
117 	return (left_len > right_len) ? 1 : -1;
118 }
119 
120 int
121 diff_atom_cmp(int *cmp,
122 	      const struct diff_atom *left,
123 	      const struct diff_atom *right)
124 {
125 	off_t remain_left, remain_right;
126 	int flags = (left->root->diff_flags | right->root->diff_flags);
127 	bool ignore_whitespace = (flags & DIFF_FLAG_IGNORE_WHITESPACE);
128 
129 	if (!left->len && !right->len) {
130 		*cmp = 0;
131 		return 0;
132 	}
133 	if (!ignore_whitespace) {
134 		if (!right->len) {
135 			*cmp = 1;
136 			return 0;
137 		}
138 		if (!left->len) {
139 			*cmp = -1;
140 			return 0;
141 		}
142 	}
143 
144 	if (left->at != NULL && right->at != NULL) {
145 		*cmp = buf_cmp(left->at, left->len, right->at, right->len,
146 		    ignore_whitespace);
147 		return 0;
148 	}
149 
150 	remain_left = left->len;
151 	remain_right = right->len;
152 	while (remain_left > 0 || remain_right > 0) {
153 		const size_t chunksz = 8192;
154 		unsigned char buf_left[chunksz], buf_right[chunksz];
155 		const uint8_t *p_left, *p_right;
156 		off_t n_left, n_right;
157 		int r;
158 
159 		if (!remain_right) {
160 			*cmp = 1;
161 			return 0;
162 		}
163 		if (!remain_left) {
164 			*cmp = -1;
165 			return 0;
166 		}
167 
168 		n_left = MIN(chunksz, remain_left);
169 		n_right = MIN(chunksz, remain_right);
170 
171 		if (left->at == NULL) {
172 			r = read_at(left->root->f,
173 				    left->pos + (left->len - remain_left),
174 				    buf_left, n_left);
175 			if (r) {
176 				*cmp = 0;
177 				return r;
178 			}
179 			p_left = buf_left;
180 		} else {
181 			p_left = left->at + (left->len - remain_left);
182 		}
183 
184 		if (right->at == NULL) {
185 			r = read_at(right->root->f,
186 				    right->pos + (right->len - remain_right),
187 				    buf_right, n_right);
188 			if (r) {
189 				*cmp = 0;
190 				return r;
191 			}
192 			p_right = buf_right;
193 		} else {
194 			p_right = right->at + (right->len - remain_right);
195 		}
196 
197 		r = buf_cmp(p_left, n_left, p_right, n_right,
198 		    ignore_whitespace);
199 		if (r) {
200 			*cmp = r;
201 			return 0;
202 		}
203 
204 		remain_left -= n_left;
205 		remain_right -= n_right;
206 	}
207 
208 	*cmp = 0;
209 	return 0;
210 }
211 
212 int
213 diff_atom_same(bool *same,
214 	       const struct diff_atom *left,
215 	       const struct diff_atom *right)
216 {
217 	int cmp;
218 	int r;
219 	if (left->hash != right->hash) {
220 		*same = false;
221 		return 0;
222 	}
223 	r = diff_atom_cmp(&cmp, left, right);
224 	if (r) {
225 		*same = true;
226 		return r;
227 	}
228 	*same = (cmp == 0);
229 	return 0;
230 }
231 
232 static struct diff_chunk *
233 diff_state_add_solved_chunk(struct diff_state *state,
234 			    const struct diff_chunk *chunk)
235 {
236 	diff_chunk_arraylist_t *result;
237 	struct diff_chunk *new_chunk;
238 	enum diff_chunk_type last_t;
239 	enum diff_chunk_type new_t;
240 	struct diff_chunk *last;
241 
242 	/* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk
243 	 * never directly follows a plus chunk. */
244 	result = &state->result->chunks;
245 
246 	last_t = result->len ? diff_chunk_type(&result->head[result->len - 1])
247 		: CHUNK_EMPTY;
248 	new_t = diff_chunk_type(chunk);
249 
250 	debug("ADD %s chunk #%u:\n", chunk->solved ? "solved" : "UNSOLVED",
251 	      result->len);
252 	debug("L\n");
253 	debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count);
254 	debug("R\n");
255 	debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count);
256 
257 	if (result->len) {
258 		last = &result->head[result->len - 1];
259 		assert(chunk->left_start
260 		       == last->left_start + last->left_count);
261 		assert(chunk->right_start
262 		       == last->right_start + last->right_count);
263 	}
264 
265 	if (new_t == last_t) {
266 		new_chunk = &result->head[result->len - 1];
267 		new_chunk->left_count += chunk->left_count;
268 		new_chunk->right_count += chunk->right_count;
269 		debug("  - added chunk touches previous one of same type, joined:\n");
270 		debug("L\n");
271 		debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
272 		debug("R\n");
273 		debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
274 	} else if (last_t == CHUNK_PLUS && new_t == CHUNK_MINUS) {
275 		enum diff_chunk_type prev_last_t =
276 			result->len > 1 ?
277 				diff_chunk_type(&result->head[result->len - 2])
278 				: CHUNK_EMPTY;
279 		/* If a minus-chunk follows a plus-chunk, place it above the plus-chunk->
280 		 * Is the one before that also a minus? combine. */
281 		if (prev_last_t == CHUNK_MINUS) {
282 			new_chunk = &result->head[result->len - 2];
283 			new_chunk->left_count += chunk->left_count;
284 			new_chunk->right_count += chunk->right_count;
285 
286 			debug("  - added minus-chunk follows plus-chunk,"
287 			      " put before that plus-chunk and joined"
288 			      " with preceding minus-chunk:\n");
289 			debug("L\n");
290 			debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
291 			debug("R\n");
292 			debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
293 		} else {
294 			ARRAYLIST_INSERT(new_chunk, *result, result->len - 1);
295 			if (!new_chunk)
296 				return NULL;
297 			*new_chunk = *chunk;
298 
299 			/* The new minus chunk indicates to which position on
300 			 * the right it corresponds, even though it doesn't add
301 			 * any lines on the right. By moving above a plus chunk,
302 			 * that position on the right has shifted. */
303 			last = &result->head[result->len - 1];
304 			new_chunk->right_start = last->right_start;
305 
306 			debug("  - added minus-chunk follows plus-chunk,"
307 			      " put before that plus-chunk\n");
308 		}
309 
310 		/* That last_t == CHUNK_PLUS indicates to which position on the
311 		 * left it corresponds, even though it doesn't add any lines on
312 		 * the left. By inserting/extending the prev_last_t ==
313 		 * CHUNK_MINUS, that position on the left has shifted. */
314 		last = &result->head[result->len - 1];
315 		last->left_start = new_chunk->left_start
316 			+ new_chunk->left_count;
317 
318 	} else {
319 		ARRAYLIST_ADD(new_chunk, *result);
320 		if (!new_chunk)
321 			return NULL;
322 		*new_chunk = *chunk;
323 	}
324 	return new_chunk;
325 }
326 
327 /* Even if a left or right side is empty, diff output may need to know the
328  * position in that file.
329  * So left_start or right_start must never be NULL -- pass left_count or
330  * right_count as zero to indicate staying at that position without consuming
331  * any lines. */
332 struct diff_chunk *
333 diff_state_add_chunk(struct diff_state *state, bool solved,
334 		     struct diff_atom *left_start, unsigned int left_count,
335 		     struct diff_atom *right_start, unsigned int right_count)
336 {
337 	struct diff_chunk *new_chunk;
338 	struct diff_chunk chunk = {
339 		.solved = solved,
340 		.left_start = left_start,
341 		.left_count = left_count,
342 		.right_start = right_start,
343 		.right_count = right_count,
344 	};
345 
346 	/* An unsolved chunk means store as intermediate result for later
347 	 * re-iteration.
348 	 * If there already are intermediate results, that means even a
349 	 * following solved chunk needs to go to intermediate results, so that
350 	 * it is later put in the final correct position in solved chunks.
351 	 */
352 	if (!solved || state->temp_result.len) {
353 		/* Append to temp_result */
354 		debug("ADD %s chunk to temp result:\n",
355 		      chunk.solved ? "solved" : "UNSOLVED");
356 		debug("L\n");
357 		debug_dump_atoms(&state->left, left_start, left_count);
358 		debug("R\n");
359 		debug_dump_atoms(&state->right, right_start, right_count);
360 		ARRAYLIST_ADD(new_chunk, state->temp_result);
361 		if (!new_chunk)
362 			return NULL;
363 		*new_chunk = chunk;
364 		return new_chunk;
365 	}
366 
367 	return diff_state_add_solved_chunk(state, &chunk);
368 }
369 
370 static void
371 diff_data_init_root(struct diff_data *d, FILE *f, const uint8_t *data,
372     unsigned long long len, int diff_flags)
373 {
374 	*d = (struct diff_data){
375 		.f = f,
376 		.pos = 0,
377 		.data = data,
378 		.len = len,
379 		.root = d,
380 		.diff_flags = diff_flags,
381 	};
382 }
383 
384 void
385 diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
386 			  struct diff_atom *from_atom, unsigned int atoms_count)
387 {
388 	struct diff_atom *last_atom;
389 
390 	debug("diff_data %p  parent %p  from_atom %p  atoms_count %u\n",
391 	      d, parent, from_atom, atoms_count);
392 	debug("  from_atom ");
393 	debug_dump_atom(parent, NULL, from_atom);
394 
395 	if (atoms_count == 0) {
396 		*d = (struct diff_data){
397 			.f = NULL,
398 			.pos = 0,
399 			.data = NULL,
400 			.len = 0,
401 			.root = parent->root,
402 			.atoms.head = NULL,
403 			.atoms.len = atoms_count,
404 		};
405 
406 		return;
407 	}
408 
409 	last_atom = from_atom + atoms_count - 1;
410 	*d = (struct diff_data){
411 		.f = NULL,
412 		.pos = from_atom->pos,
413 		.data = from_atom->at,
414 		.len = (last_atom->pos + last_atom->len) - from_atom->pos,
415 		.root = parent->root,
416 		.atoms.head = from_atom,
417 		.atoms.len = atoms_count,
418 	};
419 
420 	debug("subsection:\n");
421 	debug_dump(d);
422 }
423 
424 void
425 diff_data_free(struct diff_data *diff_data)
426 {
427 	if (!diff_data)
428 		return;
429 	if (diff_data->atoms.allocated)
430 		ARRAYLIST_FREE(diff_data->atoms);
431 }
432 
433 int
434 diff_algo_none(const struct diff_algo_config *algo_config,
435 	       struct diff_state *state)
436 {
437 	debug("\n** %s\n", __func__);
438 	debug("left:\n");
439 	debug_dump(&state->left);
440 	debug("right:\n");
441 	debug_dump(&state->right);
442 	debug_dump_myers_graph(&state->left, &state->right, NULL, NULL, 0, NULL,
443 			       0);
444 
445 	/* Add a chunk of equal lines, if any */
446 	struct diff_atom *l = state->left.atoms.head;
447 	unsigned int l_len = state->left.atoms.len;
448 	struct diff_atom *r = state->right.atoms.head;
449 	unsigned int r_len = state->right.atoms.len;
450 	unsigned int equal_atoms_start = 0;
451 	unsigned int equal_atoms_end = 0;
452 	unsigned int l_idx = 0;
453 	unsigned int r_idx = 0;
454 
455 	while (equal_atoms_start < l_len
456 	       && equal_atoms_start < r_len) {
457 		int err;
458 		bool same;
459 		err = diff_atom_same(&same, &l[equal_atoms_start],
460 				     &r[equal_atoms_start]);
461 		if (err)
462 			return err;
463 		if (!same)
464 			break;
465 		equal_atoms_start++;
466 	}
467 	while (equal_atoms_end < (l_len - equal_atoms_start)
468 	       && equal_atoms_end < (r_len - equal_atoms_start)) {
469 		int err;
470 		bool same;
471 		err = diff_atom_same(&same, &l[l_len - 1 - equal_atoms_end],
472 				   &r[r_len - 1 - equal_atoms_end]);
473 		if (err)
474 			return err;
475 		if (!same)
476 			break;
477 		equal_atoms_end++;
478 	}
479 
480 	/* Add a chunk of equal lines at the start */
481 	if (equal_atoms_start) {
482 		if (!diff_state_add_chunk(state, true,
483 					  l, equal_atoms_start,
484 					  r, equal_atoms_start))
485 			return ENOMEM;
486 		l_idx += equal_atoms_start;
487 		r_idx += equal_atoms_start;
488 	}
489 
490 	/* Add a "minus" chunk with all lines from the left. */
491 	if (equal_atoms_start + equal_atoms_end < l_len) {
492 		unsigned int add_len = l_len - equal_atoms_start - equal_atoms_end;
493 		if (!diff_state_add_chunk(state, true,
494 					  &l[l_idx], add_len,
495 					  &r[r_idx], 0))
496 			return ENOMEM;
497 		l_idx += add_len;
498 	}
499 
500 	/* Add a "plus" chunk with all lines from the right. */
501 	if (equal_atoms_start + equal_atoms_end < r_len) {
502 		unsigned int add_len = r_len - equal_atoms_start - equal_atoms_end;
503 		if (!diff_state_add_chunk(state, true,
504 					  &l[l_idx], 0,
505 					  &r[r_idx], add_len))
506 			return ENOMEM;
507 		r_idx += add_len;
508 	}
509 
510 	/* Add a chunk of equal lines at the end */
511 	if (equal_atoms_end) {
512 		if (!diff_state_add_chunk(state, true,
513 					  &l[l_idx], equal_atoms_end,
514 					  &r[r_idx], equal_atoms_end))
515 			return ENOMEM;
516 	}
517 
518 	return DIFF_RC_OK;
519 }
520 
521 static int
522 diff_run_algo(const struct diff_algo_config *algo_config,
523 	      struct diff_state *state)
524 {
525 	int rc;
526 
527 	if (!algo_config || !algo_config->impl
528 	    || !state->recursion_depth_left
529 	    || !state->left.atoms.len || !state->right.atoms.len) {
530 		debug("Fall back to diff_algo_none():%s%s%s\n",
531 		      (!algo_config || !algo_config->impl) ? " no-cfg" : "",
532 		      (!state->recursion_depth_left) ? " max-depth" : "",
533 		      (!state->left.atoms.len || !state->right.atoms.len)?
534 			      " trivial" : "");
535 		return diff_algo_none(algo_config, state);
536 	}
537 
538 	ARRAYLIST_FREE(state->temp_result);
539 	ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE);
540 	rc = algo_config->impl(algo_config, state);
541 	switch (rc) {
542 	case DIFF_RC_USE_DIFF_ALGO_FALLBACK:
543 		debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n",
544 		      algo_config->fallback_algo);
545 		rc = diff_run_algo(algo_config->fallback_algo, state);
546 		goto return_rc;
547 
548 	case DIFF_RC_OK:
549 		/* continue below */
550 		break;
551 
552 	default:
553 		/* some error happened */
554 		goto return_rc;
555 	}
556 
557 	/* Pick up any diff chunks that are still unsolved and feed to
558 	 * inner_algo.  inner_algo will solve unsolved chunks and append to
559 	 * result, and subsequent solved chunks on this level are then appended
560 	 * to result afterwards. */
561 	int i;
562 	for (i = 0; i < state->temp_result.len; i++) {
563 		struct diff_chunk *c = &state->temp_result.head[i];
564 		if (c->solved) {
565 			diff_state_add_solved_chunk(state, c);
566 			continue;
567 		}
568 
569 		/* c is an unsolved chunk, feed to inner_algo */
570 		struct diff_state inner_state = {
571 			.result = state->result,
572 			.recursion_depth_left = state->recursion_depth_left - 1,
573 			.kd_buf = state->kd_buf,
574 			.kd_buf_size = state->kd_buf_size,
575 		};
576 		diff_data_init_subsection(&inner_state.left, &state->left,
577 					  c->left_start, c->left_count);
578 		diff_data_init_subsection(&inner_state.right, &state->right,
579 					  c->right_start, c->right_count);
580 
581 		rc = diff_run_algo(algo_config->inner_algo, &inner_state);
582 		state->kd_buf = inner_state.kd_buf;
583 		state->kd_buf_size = inner_state.kd_buf_size;
584 		if (rc != DIFF_RC_OK)
585 			goto return_rc;
586 	}
587 
588 	rc = DIFF_RC_OK;
589 return_rc:
590 	ARRAYLIST_FREE(state->temp_result);
591 	return rc;
592 }
593 
594 int
595 diff_atomize_file(struct diff_data *d,
596 		  const struct diff_config *config,
597 		  FILE *f, const uint8_t *data, off_t len, int diff_flags)
598 {
599 	if (!config->atomize_func)
600 		return EINVAL;
601 
602 	diff_data_init_root(d, f, data, len, diff_flags);
603 
604 	return config->atomize_func(config->atomize_func_data, d);
605 
606 }
607 
608 struct diff_result *
609 diff_main(const struct diff_config *config, struct diff_data *left,
610 	  struct diff_data *right)
611 {
612 	struct diff_result *result = malloc(sizeof(struct diff_result));
613 	if (!result)
614 		return NULL;
615 
616 	*result = (struct diff_result){};
617 	result->left = left;
618 	result->right = right;
619 
620 	struct diff_state state = {
621 		.result = result,
622 		.recursion_depth_left = config->max_recursion_depth ?
623 		    config->max_recursion_depth : UINT_MAX,
624 		.kd_buf = NULL,
625 		.kd_buf_size = 0,
626 	};
627 	diff_data_init_subsection(&state.left, left,
628 				  left->atoms.head,
629 				  left->atoms.len);
630 	diff_data_init_subsection(&state.right, right,
631 				  right->atoms.head,
632 				  right->atoms.len);
633 
634 	result->rc = diff_run_algo(config->algo, &state);
635 	free(state.kd_buf);
636 
637 	return result;
638 }
639 
640 void
641 diff_result_free(struct diff_result *result)
642 {
643 	if (!result)
644 		return;
645 	ARRAYLIST_FREE(result->chunks);
646 	free(result);
647 }
648 
649 int
650 diff_result_contains_printable_chunks(struct diff_result *result)
651 {
652 	struct diff_chunk *c;
653 	enum diff_chunk_type t;
654 	int i;
655 
656 	for (i = 0; i < result->chunks.len; i++) {
657 		c = &result->chunks.head[i];
658 		t = diff_chunk_type(c);
659 		if (t == CHUNK_MINUS || t == CHUNK_PLUS)
660 			return 1;
661 	}
662 
663 	return 0;
664 }
665