xref: /freebsd/contrib/libdiff/lib/diff_main.c (revision 5ca8e32633c4ffbbcd6762e5888b6a4ba0708c6c)
1 /* Generic infrastructure to implement various diff algorithms (implementation). */
2 /*
3  * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 
19 #include <sys/queue.h>
20 #include <ctype.h>
21 #include <errno.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24 #include <stdbool.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <limits.h>
28 #include <unistd.h>
29 
30 #include <assert.h>
31 
32 #include <arraylist.h>
33 #include <diff_main.h>
34 
35 #include "diff_internal.h"
36 #include "diff_debug.h"
37 
38 inline enum diff_chunk_type
39 diff_chunk_type(const struct diff_chunk *chunk)
40 {
41 	if (!chunk->left_count && !chunk->right_count)
42 		return CHUNK_EMPTY;
43 	if (!chunk->solved)
44 		return CHUNK_ERROR;
45 	if (!chunk->right_count)
46 		return CHUNK_MINUS;
47 	if (!chunk->left_count)
48 		return CHUNK_PLUS;
49 	if (chunk->left_count != chunk->right_count)
50 		return CHUNK_ERROR;
51 	return CHUNK_SAME;
52 }
53 
54 static int
55 read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len)
56 {
57 	int r;
58 	if (fseeko(f, at_pos, SEEK_SET) == -1)
59 		return errno;
60 	r = fread(buf, sizeof(char), len, f);
61 	if ((r == 0 || r < len) && ferror(f))
62 		return EIO;
63 	if (r != len)
64 		return EIO;
65 	return 0;
66 }
67 
68 static int
69 buf_cmp(const unsigned char *left, size_t left_len,
70 	const unsigned char *right, size_t right_len,
71 	bool ignore_whitespace)
72 {
73 	int cmp;
74 
75 	if (ignore_whitespace) {
76 		int il = 0, ir = 0;
77 		while (il < left_len && ir < right_len) {
78 			unsigned char cl = left[il];
79 			unsigned char cr = right[ir];
80 
81 			if (isspace((unsigned char)cl) && il < left_len) {
82 				il++;
83 				continue;
84 			}
85 			if (isspace((unsigned char)cr) && ir < right_len) {
86 				ir++;
87 				continue;
88 			}
89 
90 			if (cl > cr)
91 				return 1;
92 			if (cr > cl)
93 				return -1;
94 			il++;
95 			ir++;
96 		}
97 		while (il < left_len) {
98 			unsigned char cl = left[il++];
99 			if (!isspace((unsigned char)cl))
100 				return 1;
101 		}
102 		while (ir < right_len) {
103 			unsigned char cr = right[ir++];
104 			if (!isspace((unsigned char)cr))
105 				return -1;
106 		}
107 
108 		return 0;
109 	}
110 
111 	cmp = memcmp(left, right, MIN(left_len, right_len));
112 	if (cmp)
113 		return cmp;
114 	if (left_len == right_len)
115 		return 0;
116 	return (left_len > right_len) ? 1 : -1;
117 }
118 
119 int
120 diff_atom_cmp(int *cmp,
121 	      const struct diff_atom *left,
122 	      const struct diff_atom *right)
123 {
124 	off_t remain_left, remain_right;
125 	int flags = (left->root->diff_flags | right->root->diff_flags);
126 	bool ignore_whitespace = (flags & DIFF_FLAG_IGNORE_WHITESPACE);
127 
128 	if (!left->len && !right->len) {
129 		*cmp = 0;
130 		return 0;
131 	}
132 	if (!ignore_whitespace) {
133 		if (!right->len) {
134 			*cmp = 1;
135 			return 0;
136 		}
137 		if (!left->len) {
138 			*cmp = -1;
139 			return 0;
140 		}
141 	}
142 
143 	if (left->at != NULL && right->at != NULL) {
144 		*cmp = buf_cmp(left->at, left->len, right->at, right->len,
145 		    ignore_whitespace);
146 		return 0;
147 	}
148 
149 	remain_left = left->len;
150 	remain_right = right->len;
151 	while (remain_left > 0 || remain_right > 0) {
152 		const size_t chunksz = 8192;
153 		unsigned char buf_left[chunksz], buf_right[chunksz];
154 		const uint8_t *p_left, *p_right;
155 		off_t n_left, n_right;
156 		ssize_t r;
157 
158 		if (!remain_right) {
159 			*cmp = 1;
160 			return 0;
161 		}
162 		if (!remain_left) {
163 			*cmp = -1;
164 			return 0;
165 		}
166 
167 		n_left = MIN(chunksz, remain_left);
168 		n_right = MIN(chunksz, remain_right);
169 
170 		if (left->at == NULL) {
171 			r = read_at(left->root->f,
172 				    left->pos + (left->len - remain_left),
173 				    buf_left, n_left);
174 			if (r) {
175 				*cmp = 0;
176 				return r;
177 			}
178 			p_left = buf_left;
179 		} else {
180 			p_left = left->at + (left->len - remain_left);
181 		}
182 
183 		if (right->at == NULL) {
184 			r = read_at(right->root->f,
185 				    right->pos + (right->len - remain_right),
186 				    buf_right, n_right);
187 			if (r) {
188 				*cmp = 0;
189 				return r;
190 			}
191 			p_right = buf_right;
192 		} else {
193 			p_right = right->at + (right->len - remain_right);
194 		}
195 
196 		r = buf_cmp(p_left, n_left, p_right, n_right,
197 		    ignore_whitespace);
198 		if (r) {
199 			*cmp = r;
200 			return 0;
201 		}
202 
203 		remain_left -= n_left;
204 		remain_right -= n_right;
205 	}
206 
207 	*cmp = 0;
208 	return 0;
209 }
210 
211 int
212 diff_atom_same(bool *same,
213 	       const struct diff_atom *left,
214 	       const struct diff_atom *right)
215 {
216 	int cmp;
217 	int r;
218 	if (left->hash != right->hash) {
219 		*same = false;
220 		return 0;
221 	}
222 	r = diff_atom_cmp(&cmp, left, right);
223 	if (r) {
224 		*same = true;
225 		return r;
226 	}
227 	*same = (cmp == 0);
228 	return 0;
229 }
230 
231 static struct diff_chunk *
232 diff_state_add_solved_chunk(struct diff_state *state,
233 			    const struct diff_chunk *chunk)
234 {
235 	diff_chunk_arraylist_t *result;
236 	struct diff_chunk *new_chunk;
237 	enum diff_chunk_type last_t;
238 	enum diff_chunk_type new_t;
239 	struct diff_chunk *last;
240 
241 	/* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk
242 	 * never directly follows a plus chunk. */
243 	result = &state->result->chunks;
244 
245 	last_t = result->len ? diff_chunk_type(&result->head[result->len - 1])
246 		: CHUNK_EMPTY;
247 	new_t = diff_chunk_type(chunk);
248 
249 	debug("ADD %s chunk #%u:\n", chunk->solved ? "solved" : "UNSOLVED",
250 	      result->len);
251 	debug("L\n");
252 	debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count);
253 	debug("R\n");
254 	debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count);
255 
256 	if (result->len) {
257 		last = &result->head[result->len - 1];
258 		assert(chunk->left_start
259 		       == last->left_start + last->left_count);
260 		assert(chunk->right_start
261 		       == last->right_start + last->right_count);
262 	}
263 
264 	if (new_t == last_t) {
265 		new_chunk = &result->head[result->len - 1];
266 		new_chunk->left_count += chunk->left_count;
267 		new_chunk->right_count += chunk->right_count;
268 		debug("  - added chunk touches previous one of same type, joined:\n");
269 		debug("L\n");
270 		debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
271 		debug("R\n");
272 		debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
273 	} else if (last_t == CHUNK_PLUS && new_t == CHUNK_MINUS) {
274 		enum diff_chunk_type prev_last_t =
275 			result->len > 1 ?
276 				diff_chunk_type(&result->head[result->len - 2])
277 				: CHUNK_EMPTY;
278 		/* If a minus-chunk follows a plus-chunk, place it above the plus-chunk->
279 		 * Is the one before that also a minus? combine. */
280 		if (prev_last_t == CHUNK_MINUS) {
281 			new_chunk = &result->head[result->len - 2];
282 			new_chunk->left_count += chunk->left_count;
283 			new_chunk->right_count += chunk->right_count;
284 
285 			debug("  - added minus-chunk follows plus-chunk,"
286 			      " put before that plus-chunk and joined"
287 			      " with preceding minus-chunk:\n");
288 			debug("L\n");
289 			debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
290 			debug("R\n");
291 			debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
292 		} else {
293 			ARRAYLIST_INSERT(new_chunk, *result, result->len - 1);
294 			if (!new_chunk)
295 				return NULL;
296 			*new_chunk = *chunk;
297 
298 			/* The new minus chunk indicates to which position on
299 			 * the right it corresponds, even though it doesn't add
300 			 * any lines on the right. By moving above a plus chunk,
301 			 * that position on the right has shifted. */
302 			last = &result->head[result->len - 1];
303 			new_chunk->right_start = last->right_start;
304 
305 			debug("  - added minus-chunk follows plus-chunk,"
306 			      " put before that plus-chunk\n");
307 		}
308 
309 		/* That last_t == CHUNK_PLUS indicates to which position on the
310 		 * left it corresponds, even though it doesn't add any lines on
311 		 * the left. By inserting/extending the prev_last_t ==
312 		 * CHUNK_MINUS, that position on the left has shifted. */
313 		last = &result->head[result->len - 1];
314 		last->left_start = new_chunk->left_start
315 			+ new_chunk->left_count;
316 
317 	} else {
318 		ARRAYLIST_ADD(new_chunk, *result);
319 		if (!new_chunk)
320 			return NULL;
321 		*new_chunk = *chunk;
322 	}
323 	return new_chunk;
324 }
325 
326 /* Even if a left or right side is empty, diff output may need to know the
327  * position in that file.
328  * So left_start or right_start must never be NULL -- pass left_count or
329  * right_count as zero to indicate staying at that position without consuming
330  * any lines. */
331 struct diff_chunk *
332 diff_state_add_chunk(struct diff_state *state, bool solved,
333 		     struct diff_atom *left_start, unsigned int left_count,
334 		     struct diff_atom *right_start, unsigned int right_count)
335 {
336 	struct diff_chunk *new_chunk;
337 	struct diff_chunk chunk = {
338 		.solved = solved,
339 		.left_start = left_start,
340 		.left_count = left_count,
341 		.right_start = right_start,
342 		.right_count = right_count,
343 	};
344 
345 	/* An unsolved chunk means store as intermediate result for later
346 	 * re-iteration.
347 	 * If there already are intermediate results, that means even a
348 	 * following solved chunk needs to go to intermediate results, so that
349 	 * it is later put in the final correct position in solved chunks.
350 	 */
351 	if (!solved || state->temp_result.len) {
352 		/* Append to temp_result */
353 		debug("ADD %s chunk to temp result:\n",
354 		      chunk.solved ? "solved" : "UNSOLVED");
355 		debug("L\n");
356 		debug_dump_atoms(&state->left, left_start, left_count);
357 		debug("R\n");
358 		debug_dump_atoms(&state->right, right_start, right_count);
359 		ARRAYLIST_ADD(new_chunk, state->temp_result);
360 		if (!new_chunk)
361 			return NULL;
362 		*new_chunk = chunk;
363 		return new_chunk;
364 	}
365 
366 	return diff_state_add_solved_chunk(state, &chunk);
367 }
368 
369 static void
370 diff_data_init_root(struct diff_data *d, FILE *f, const uint8_t *data,
371     unsigned long long len, int diff_flags)
372 {
373 	*d = (struct diff_data){
374 		.f = f,
375 		.pos = 0,
376 		.data = data,
377 		.len = len,
378 		.root = d,
379 		.diff_flags = diff_flags,
380 	};
381 }
382 
383 void
384 diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
385 			  struct diff_atom *from_atom, unsigned int atoms_count)
386 {
387 	struct diff_atom *last_atom;
388 
389 	debug("diff_data %p  parent %p  from_atom %p  atoms_count %u\n",
390 	      d, parent, from_atom, atoms_count);
391 	debug("  from_atom ");
392 	debug_dump_atom(parent, NULL, from_atom);
393 
394 	if (atoms_count == 0) {
395 		*d = (struct diff_data){
396 			.f = NULL,
397 			.pos = 0,
398 			.data = NULL,
399 			.len = 0,
400 			.root = parent->root,
401 			.atoms.head = NULL,
402 			.atoms.len = atoms_count,
403 		};
404 
405 		return;
406 	}
407 
408 	last_atom = from_atom + atoms_count - 1;
409 	*d = (struct diff_data){
410 		.f = NULL,
411 		.pos = from_atom->pos,
412 		.data = from_atom->at,
413 		.len = (last_atom->pos + last_atom->len) - from_atom->pos,
414 		.root = parent->root,
415 		.atoms.head = from_atom,
416 		.atoms.len = atoms_count,
417 	};
418 
419 	debug("subsection:\n");
420 	debug_dump(d);
421 }
422 
423 void
424 diff_data_free(struct diff_data *diff_data)
425 {
426 	if (!diff_data)
427 		return;
428 	if (diff_data->atoms.allocated)
429 		ARRAYLIST_FREE(diff_data->atoms);
430 }
431 
432 int
433 diff_algo_none(const struct diff_algo_config *algo_config,
434 	       struct diff_state *state)
435 {
436 	debug("\n** %s\n", __func__);
437 	debug("left:\n");
438 	debug_dump(&state->left);
439 	debug("right:\n");
440 	debug_dump(&state->right);
441 	debug_dump_myers_graph(&state->left, &state->right, NULL, NULL, 0, NULL,
442 			       0);
443 
444 	/* Add a chunk of equal lines, if any */
445 	struct diff_atom *l = state->left.atoms.head;
446 	unsigned int l_len = state->left.atoms.len;
447 	struct diff_atom *r = state->right.atoms.head;
448 	unsigned int r_len = state->right.atoms.len;
449 	unsigned int equal_atoms_start = 0;
450 	unsigned int equal_atoms_end = 0;
451 	unsigned int l_idx = 0;
452 	unsigned int r_idx = 0;
453 
454 	while (equal_atoms_start < l_len
455 	       && equal_atoms_start < r_len) {
456 		int err;
457 		bool same;
458 		err = diff_atom_same(&same, &l[equal_atoms_start],
459 				     &r[equal_atoms_start]);
460 		if (err)
461 			return err;
462 		if (!same)
463 			break;
464 		equal_atoms_start++;
465 	}
466 	while (equal_atoms_end < (l_len - equal_atoms_start)
467 	       && equal_atoms_end < (r_len - equal_atoms_start)) {
468 		int err;
469 		bool same;
470 		err = diff_atom_same(&same, &l[l_len - 1 - equal_atoms_end],
471 				   &r[r_len - 1 - equal_atoms_end]);
472 		if (err)
473 			return err;
474 		if (!same)
475 			break;
476 		equal_atoms_end++;
477 	}
478 
479 	/* Add a chunk of equal lines at the start */
480 	if (equal_atoms_start) {
481 		if (!diff_state_add_chunk(state, true,
482 					  l, equal_atoms_start,
483 					  r, equal_atoms_start))
484 			return ENOMEM;
485 		l_idx += equal_atoms_start;
486 		r_idx += equal_atoms_start;
487 	}
488 
489 	/* Add a "minus" chunk with all lines from the left. */
490 	if (equal_atoms_start + equal_atoms_end < l_len) {
491 		unsigned int add_len = l_len - equal_atoms_start - equal_atoms_end;
492 		if (!diff_state_add_chunk(state, true,
493 					  &l[l_idx], add_len,
494 					  &r[r_idx], 0))
495 			return ENOMEM;
496 		l_idx += add_len;
497 	}
498 
499 	/* Add a "plus" chunk with all lines from the right. */
500 	if (equal_atoms_start + equal_atoms_end < r_len) {
501 		unsigned int add_len = r_len - equal_atoms_start - equal_atoms_end;
502 		if (!diff_state_add_chunk(state, true,
503 					  &l[l_idx], 0,
504 					  &r[r_idx], add_len))
505 			return ENOMEM;
506 		r_idx += add_len;
507 	}
508 
509 	/* Add a chunk of equal lines at the end */
510 	if (equal_atoms_end) {
511 		if (!diff_state_add_chunk(state, true,
512 					  &l[l_idx], equal_atoms_end,
513 					  &r[r_idx], equal_atoms_end))
514 			return ENOMEM;
515 	}
516 
517 	return DIFF_RC_OK;
518 }
519 
520 static int
521 diff_run_algo(const struct diff_algo_config *algo_config,
522 	      struct diff_state *state)
523 {
524 	int rc;
525 
526 	if (!algo_config || !algo_config->impl
527 	    || !state->recursion_depth_left
528 	    || !state->left.atoms.len || !state->right.atoms.len) {
529 		debug("Fall back to diff_algo_none():%s%s%s\n",
530 		      (!algo_config || !algo_config->impl) ? " no-cfg" : "",
531 		      (!state->recursion_depth_left) ? " max-depth" : "",
532 		      (!state->left.atoms.len || !state->right.atoms.len)?
533 			      " trivial" : "");
534 		return diff_algo_none(algo_config, state);
535 	}
536 
537 	ARRAYLIST_FREE(state->temp_result);
538 	ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE);
539 	rc = algo_config->impl(algo_config, state);
540 	switch (rc) {
541 	case DIFF_RC_USE_DIFF_ALGO_FALLBACK:
542 		debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n",
543 		      algo_config->fallback_algo);
544 		rc = diff_run_algo(algo_config->fallback_algo, state);
545 		goto return_rc;
546 
547 	case DIFF_RC_OK:
548 		/* continue below */
549 		break;
550 
551 	default:
552 		/* some error happened */
553 		goto return_rc;
554 	}
555 
556 	/* Pick up any diff chunks that are still unsolved and feed to
557 	 * inner_algo.  inner_algo will solve unsolved chunks and append to
558 	 * result, and subsequent solved chunks on this level are then appended
559 	 * to result afterwards. */
560 	int i;
561 	for (i = 0; i < state->temp_result.len; i++) {
562 		struct diff_chunk *c = &state->temp_result.head[i];
563 		if (c->solved) {
564 			diff_state_add_solved_chunk(state, c);
565 			continue;
566 		}
567 
568 		/* c is an unsolved chunk, feed to inner_algo */
569 		struct diff_state inner_state = {
570 			.result = state->result,
571 			.recursion_depth_left = state->recursion_depth_left - 1,
572 			.kd_buf = state->kd_buf,
573 			.kd_buf_size = state->kd_buf_size,
574 		};
575 		diff_data_init_subsection(&inner_state.left, &state->left,
576 					  c->left_start, c->left_count);
577 		diff_data_init_subsection(&inner_state.right, &state->right,
578 					  c->right_start, c->right_count);
579 
580 		rc = diff_run_algo(algo_config->inner_algo, &inner_state);
581 		state->kd_buf = inner_state.kd_buf;
582 		state->kd_buf_size = inner_state.kd_buf_size;
583 		if (rc != DIFF_RC_OK)
584 			goto return_rc;
585 	}
586 
587 	rc = DIFF_RC_OK;
588 return_rc:
589 	ARRAYLIST_FREE(state->temp_result);
590 	return rc;
591 }
592 
593 int
594 diff_atomize_file(struct diff_data *d,
595 		  const struct diff_config *config,
596 		  FILE *f, const uint8_t *data, off_t len, int diff_flags)
597 {
598 	if (!config->atomize_func)
599 		return EINVAL;
600 
601 	diff_data_init_root(d, f, data, len, diff_flags);
602 
603 	return config->atomize_func(config->atomize_func_data, d);
604 
605 }
606 
607 struct diff_result *
608 diff_main(const struct diff_config *config, struct diff_data *left,
609 	  struct diff_data *right)
610 {
611 	struct diff_result *result = malloc(sizeof(struct diff_result));
612 	if (!result)
613 		return NULL;
614 
615 	*result = (struct diff_result){};
616 	result->left = left;
617 	result->right = right;
618 
619 	struct diff_state state = {
620 		.result = result,
621 		.recursion_depth_left = config->max_recursion_depth ?
622 		    config->max_recursion_depth : UINT_MAX,
623 		.kd_buf = NULL,
624 		.kd_buf_size = 0,
625 	};
626 	diff_data_init_subsection(&state.left, left,
627 				  left->atoms.head,
628 				  left->atoms.len);
629 	diff_data_init_subsection(&state.right, right,
630 				  right->atoms.head,
631 				  right->atoms.len);
632 
633 	result->rc = diff_run_algo(config->algo, &state);
634 	free(state.kd_buf);
635 
636 	return result;
637 }
638 
639 void
640 diff_result_free(struct diff_result *result)
641 {
642 	if (!result)
643 		return;
644 	ARRAYLIST_FREE(result->chunks);
645 	free(result);
646 }
647 
648 int
649 diff_result_contains_printable_chunks(struct diff_result *result)
650 {
651 	struct diff_chunk *c;
652 	enum diff_chunk_type t;
653 	int i;
654 
655 	for (i = 0; i < result->chunks.len; i++) {
656 		c = &result->chunks.head[i];
657 		t = diff_chunk_type(c);
658 		if (t == CHUNK_MINUS || t == CHUNK_PLUS)
659 			return 1;
660 	}
661 
662 	return 0;
663 }
664