xref: /freebsd/contrib/libdiff/lib/diff_main.c (revision 020d003c86367bb5751b6d58fb58611242802c7f)
1  /* Generic infrastructure to implement various diff algorithms (implementation). */
2  /*
3   * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4   *
5   * Permission to use, copy, modify, and distribute this software for any
6   * purpose with or without fee is hereby granted, provided that the above
7   * copyright notice and this permission notice appear in all copies.
8   *
9   * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10   * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11   * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12   * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13   * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14   * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15   * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16   */
17  
18  
19  #include <sys/queue.h>
20  #include <ctype.h>
21  #include <errno.h>
22  #include <stdint.h>
23  #include <stdlib.h>
24  #include <stdbool.h>
25  #include <stdio.h>
26  #include <string.h>
27  #include <limits.h>
28  #include <unistd.h>
29  
30  #include <assert.h>
31  
32  #include <arraylist.h>
33  #include <diff_main.h>
34  
35  #include "diff_internal.h"
36  #include "diff_debug.h"
37  
38  inline enum diff_chunk_type
39  diff_chunk_type(const struct diff_chunk *chunk)
40  {
41  	if (!chunk->left_count && !chunk->right_count)
42  		return CHUNK_EMPTY;
43  	if (!chunk->solved)
44  		return CHUNK_ERROR;
45  	if (!chunk->right_count)
46  		return CHUNK_MINUS;
47  	if (!chunk->left_count)
48  		return CHUNK_PLUS;
49  	if (chunk->left_count != chunk->right_count)
50  		return CHUNK_ERROR;
51  	return CHUNK_SAME;
52  }
53  
54  static int
55  read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len)
56  {
57  	ssize_t r;
58  
59  	if (fseeko(f, at_pos, SEEK_SET) == -1)
60  		return errno;
61  	r = fread(buf, sizeof(char), len, f);
62  	if ((r == 0 || r < len) && ferror(f))
63  		return EIO;
64  	if (r != len)
65  		return EIO;
66  	return 0;
67  }
68  
69  static int
70  buf_cmp(const unsigned char *left, size_t left_len,
71  	const unsigned char *right, size_t right_len,
72  	bool ignore_whitespace)
73  {
74  	int cmp;
75  
76  	if (ignore_whitespace) {
77  		int il = 0, ir = 0;
78  		while (il < left_len && ir < right_len) {
79  			unsigned char cl = left[il];
80  			unsigned char cr = right[ir];
81  
82  			if (isspace((unsigned char)cl) && il < left_len) {
83  				il++;
84  				continue;
85  			}
86  			if (isspace((unsigned char)cr) && ir < right_len) {
87  				ir++;
88  				continue;
89  			}
90  
91  			if (cl > cr)
92  				return 1;
93  			if (cr > cl)
94  				return -1;
95  			il++;
96  			ir++;
97  		}
98  		while (il < left_len) {
99  			unsigned char cl = left[il++];
100  			if (!isspace((unsigned char)cl))
101  				return 1;
102  		}
103  		while (ir < right_len) {
104  			unsigned char cr = right[ir++];
105  			if (!isspace((unsigned char)cr))
106  				return -1;
107  		}
108  
109  		return 0;
110  	}
111  
112  	cmp = memcmp(left, right, MIN(left_len, right_len));
113  	if (cmp)
114  		return cmp;
115  	if (left_len == right_len)
116  		return 0;
117  	return (left_len > right_len) ? 1 : -1;
118  }
119  
120  int
121  diff_atom_cmp(int *cmp,
122  	      const struct diff_atom *left,
123  	      const struct diff_atom *right)
124  {
125  	off_t remain_left, remain_right;
126  	int flags = (left->root->diff_flags | right->root->diff_flags);
127  	bool ignore_whitespace = (flags & DIFF_FLAG_IGNORE_WHITESPACE);
128  
129  	if (!left->len && !right->len) {
130  		*cmp = 0;
131  		return 0;
132  	}
133  	if (!ignore_whitespace) {
134  		if (!right->len) {
135  			*cmp = 1;
136  			return 0;
137  		}
138  		if (!left->len) {
139  			*cmp = -1;
140  			return 0;
141  		}
142  	}
143  
144  	if (left->at != NULL && right->at != NULL) {
145  		*cmp = buf_cmp(left->at, left->len, right->at, right->len,
146  		    ignore_whitespace);
147  		return 0;
148  	}
149  
150  	remain_left = left->len;
151  	remain_right = right->len;
152  	while (remain_left > 0 || remain_right > 0) {
153  		const size_t chunksz = 8192;
154  		unsigned char buf_left[chunksz], buf_right[chunksz];
155  		const uint8_t *p_left, *p_right;
156  		off_t n_left, n_right;
157  		int r;
158  
159  		if (!remain_right) {
160  			*cmp = 1;
161  			return 0;
162  		}
163  		if (!remain_left) {
164  			*cmp = -1;
165  			return 0;
166  		}
167  
168  		n_left = MIN(chunksz, remain_left);
169  		n_right = MIN(chunksz, remain_right);
170  
171  		if (left->at == NULL) {
172  			r = read_at(left->root->f,
173  				    left->pos + (left->len - remain_left),
174  				    buf_left, n_left);
175  			if (r) {
176  				*cmp = 0;
177  				return r;
178  			}
179  			p_left = buf_left;
180  		} else {
181  			p_left = left->at + (left->len - remain_left);
182  		}
183  
184  		if (right->at == NULL) {
185  			r = read_at(right->root->f,
186  				    right->pos + (right->len - remain_right),
187  				    buf_right, n_right);
188  			if (r) {
189  				*cmp = 0;
190  				return r;
191  			}
192  			p_right = buf_right;
193  		} else {
194  			p_right = right->at + (right->len - remain_right);
195  		}
196  
197  		r = buf_cmp(p_left, n_left, p_right, n_right,
198  		    ignore_whitespace);
199  		if (r) {
200  			*cmp = r;
201  			return 0;
202  		}
203  
204  		remain_left -= n_left;
205  		remain_right -= n_right;
206  	}
207  
208  	*cmp = 0;
209  	return 0;
210  }
211  
212  int
213  diff_atom_same(bool *same,
214  	       const struct diff_atom *left,
215  	       const struct diff_atom *right)
216  {
217  	int cmp;
218  	int r;
219  	if (left->hash != right->hash) {
220  		*same = false;
221  		return 0;
222  	}
223  	r = diff_atom_cmp(&cmp, left, right);
224  	if (r) {
225  		*same = true;
226  		return r;
227  	}
228  	*same = (cmp == 0);
229  	return 0;
230  }
231  
232  static struct diff_chunk *
233  diff_state_add_solved_chunk(struct diff_state *state,
234  			    const struct diff_chunk *chunk)
235  {
236  	diff_chunk_arraylist_t *result;
237  	struct diff_chunk *new_chunk;
238  	enum diff_chunk_type last_t;
239  	enum diff_chunk_type new_t;
240  	struct diff_chunk *last;
241  
242  	/* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk
243  	 * never directly follows a plus chunk. */
244  	result = &state->result->chunks;
245  
246  	last_t = result->len ? diff_chunk_type(&result->head[result->len - 1])
247  		: CHUNK_EMPTY;
248  	new_t = diff_chunk_type(chunk);
249  
250  	debug("ADD %s chunk #%u:\n", chunk->solved ? "solved" : "UNSOLVED",
251  	      result->len);
252  	debug("L\n");
253  	debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count);
254  	debug("R\n");
255  	debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count);
256  
257  	if (result->len) {
258  		last = &result->head[result->len - 1];
259  		assert(chunk->left_start
260  		       == last->left_start + last->left_count);
261  		assert(chunk->right_start
262  		       == last->right_start + last->right_count);
263  	}
264  
265  	if (new_t == last_t) {
266  		new_chunk = &result->head[result->len - 1];
267  		new_chunk->left_count += chunk->left_count;
268  		new_chunk->right_count += chunk->right_count;
269  		debug("  - added chunk touches previous one of same type, joined:\n");
270  		debug("L\n");
271  		debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
272  		debug("R\n");
273  		debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
274  	} else if (last_t == CHUNK_PLUS && new_t == CHUNK_MINUS) {
275  		enum diff_chunk_type prev_last_t =
276  			result->len > 1 ?
277  				diff_chunk_type(&result->head[result->len - 2])
278  				: CHUNK_EMPTY;
279  		/* If a minus-chunk follows a plus-chunk, place it above the plus-chunk->
280  		 * Is the one before that also a minus? combine. */
281  		if (prev_last_t == CHUNK_MINUS) {
282  			new_chunk = &result->head[result->len - 2];
283  			new_chunk->left_count += chunk->left_count;
284  			new_chunk->right_count += chunk->right_count;
285  
286  			debug("  - added minus-chunk follows plus-chunk,"
287  			      " put before that plus-chunk and joined"
288  			      " with preceding minus-chunk:\n");
289  			debug("L\n");
290  			debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
291  			debug("R\n");
292  			debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
293  		} else {
294  			ARRAYLIST_INSERT(new_chunk, *result, result->len - 1);
295  			if (!new_chunk)
296  				return NULL;
297  			*new_chunk = *chunk;
298  
299  			/* The new minus chunk indicates to which position on
300  			 * the right it corresponds, even though it doesn't add
301  			 * any lines on the right. By moving above a plus chunk,
302  			 * that position on the right has shifted. */
303  			last = &result->head[result->len - 1];
304  			new_chunk->right_start = last->right_start;
305  
306  			debug("  - added minus-chunk follows plus-chunk,"
307  			      " put before that plus-chunk\n");
308  		}
309  
310  		/* That last_t == CHUNK_PLUS indicates to which position on the
311  		 * left it corresponds, even though it doesn't add any lines on
312  		 * the left. By inserting/extending the prev_last_t ==
313  		 * CHUNK_MINUS, that position on the left has shifted. */
314  		last = &result->head[result->len - 1];
315  		last->left_start = new_chunk->left_start
316  			+ new_chunk->left_count;
317  
318  	} else {
319  		ARRAYLIST_ADD(new_chunk, *result);
320  		if (!new_chunk)
321  			return NULL;
322  		*new_chunk = *chunk;
323  	}
324  	return new_chunk;
325  }
326  
327  /* Even if a left or right side is empty, diff output may need to know the
328   * position in that file.
329   * So left_start or right_start must never be NULL -- pass left_count or
330   * right_count as zero to indicate staying at that position without consuming
331   * any lines. */
332  struct diff_chunk *
333  diff_state_add_chunk(struct diff_state *state, bool solved,
334  		     struct diff_atom *left_start, unsigned int left_count,
335  		     struct diff_atom *right_start, unsigned int right_count)
336  {
337  	struct diff_chunk *new_chunk;
338  	struct diff_chunk chunk = {
339  		.solved = solved,
340  		.left_start = left_start,
341  		.left_count = left_count,
342  		.right_start = right_start,
343  		.right_count = right_count,
344  	};
345  
346  	/* An unsolved chunk means store as intermediate result for later
347  	 * re-iteration.
348  	 * If there already are intermediate results, that means even a
349  	 * following solved chunk needs to go to intermediate results, so that
350  	 * it is later put in the final correct position in solved chunks.
351  	 */
352  	if (!solved || state->temp_result.len) {
353  		/* Append to temp_result */
354  		debug("ADD %s chunk to temp result:\n",
355  		      chunk.solved ? "solved" : "UNSOLVED");
356  		debug("L\n");
357  		debug_dump_atoms(&state->left, left_start, left_count);
358  		debug("R\n");
359  		debug_dump_atoms(&state->right, right_start, right_count);
360  		ARRAYLIST_ADD(new_chunk, state->temp_result);
361  		if (!new_chunk)
362  			return NULL;
363  		*new_chunk = chunk;
364  		return new_chunk;
365  	}
366  
367  	return diff_state_add_solved_chunk(state, &chunk);
368  }
369  
370  static void
371  diff_data_init_root(struct diff_data *d, FILE *f, const uint8_t *data,
372      unsigned long long len, int diff_flags)
373  {
374  	*d = (struct diff_data){
375  		.f = f,
376  		.pos = 0,
377  		.data = data,
378  		.len = len,
379  		.root = d,
380  		.diff_flags = diff_flags,
381  	};
382  }
383  
384  void
385  diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
386  			  struct diff_atom *from_atom, unsigned int atoms_count)
387  {
388  	struct diff_atom *last_atom;
389  
390  	debug("diff_data %p  parent %p  from_atom %p  atoms_count %u\n",
391  	      d, parent, from_atom, atoms_count);
392  	debug("  from_atom ");
393  	debug_dump_atom(parent, NULL, from_atom);
394  
395  	if (atoms_count == 0) {
396  		*d = (struct diff_data){
397  			.f = NULL,
398  			.pos = 0,
399  			.data = NULL,
400  			.len = 0,
401  			.root = parent->root,
402  			.atoms.head = NULL,
403  			.atoms.len = atoms_count,
404  		};
405  
406  		return;
407  	}
408  
409  	last_atom = from_atom + atoms_count - 1;
410  	*d = (struct diff_data){
411  		.f = NULL,
412  		.pos = from_atom->pos,
413  		.data = from_atom->at,
414  		.len = (last_atom->pos + last_atom->len) - from_atom->pos,
415  		.root = parent->root,
416  		.atoms.head = from_atom,
417  		.atoms.len = atoms_count,
418  	};
419  
420  	debug("subsection:\n");
421  	debug_dump(d);
422  }
423  
424  void
425  diff_data_free(struct diff_data *diff_data)
426  {
427  	if (!diff_data)
428  		return;
429  	if (diff_data->atoms.allocated)
430  		ARRAYLIST_FREE(diff_data->atoms);
431  }
432  
433  int
434  diff_algo_none(const struct diff_algo_config *algo_config,
435  	       struct diff_state *state)
436  {
437  	debug("\n** %s\n", __func__);
438  	debug("left:\n");
439  	debug_dump(&state->left);
440  	debug("right:\n");
441  	debug_dump(&state->right);
442  	debug_dump_myers_graph(&state->left, &state->right, NULL, NULL, 0, NULL,
443  			       0);
444  
445  	/* Add a chunk of equal lines, if any */
446  	struct diff_atom *l = state->left.atoms.head;
447  	unsigned int l_len = state->left.atoms.len;
448  	struct diff_atom *r = state->right.atoms.head;
449  	unsigned int r_len = state->right.atoms.len;
450  	unsigned int equal_atoms_start = 0;
451  	unsigned int equal_atoms_end = 0;
452  	unsigned int l_idx = 0;
453  	unsigned int r_idx = 0;
454  
455  	while (equal_atoms_start < l_len
456  	       && equal_atoms_start < r_len) {
457  		int err;
458  		bool same;
459  		err = diff_atom_same(&same, &l[equal_atoms_start],
460  				     &r[equal_atoms_start]);
461  		if (err)
462  			return err;
463  		if (!same)
464  			break;
465  		equal_atoms_start++;
466  	}
467  	while (equal_atoms_end < (l_len - equal_atoms_start)
468  	       && equal_atoms_end < (r_len - equal_atoms_start)) {
469  		int err;
470  		bool same;
471  		err = diff_atom_same(&same, &l[l_len - 1 - equal_atoms_end],
472  				   &r[r_len - 1 - equal_atoms_end]);
473  		if (err)
474  			return err;
475  		if (!same)
476  			break;
477  		equal_atoms_end++;
478  	}
479  
480  	/* Add a chunk of equal lines at the start */
481  	if (equal_atoms_start) {
482  		if (!diff_state_add_chunk(state, true,
483  					  l, equal_atoms_start,
484  					  r, equal_atoms_start))
485  			return ENOMEM;
486  		l_idx += equal_atoms_start;
487  		r_idx += equal_atoms_start;
488  	}
489  
490  	/* Add a "minus" chunk with all lines from the left. */
491  	if (equal_atoms_start + equal_atoms_end < l_len) {
492  		unsigned int add_len = l_len - equal_atoms_start - equal_atoms_end;
493  		if (!diff_state_add_chunk(state, true,
494  					  &l[l_idx], add_len,
495  					  &r[r_idx], 0))
496  			return ENOMEM;
497  		l_idx += add_len;
498  	}
499  
500  	/* Add a "plus" chunk with all lines from the right. */
501  	if (equal_atoms_start + equal_atoms_end < r_len) {
502  		unsigned int add_len = r_len - equal_atoms_start - equal_atoms_end;
503  		if (!diff_state_add_chunk(state, true,
504  					  &l[l_idx], 0,
505  					  &r[r_idx], add_len))
506  			return ENOMEM;
507  		r_idx += add_len;
508  	}
509  
510  	/* Add a chunk of equal lines at the end */
511  	if (equal_atoms_end) {
512  		if (!diff_state_add_chunk(state, true,
513  					  &l[l_idx], equal_atoms_end,
514  					  &r[r_idx], equal_atoms_end))
515  			return ENOMEM;
516  	}
517  
518  	return DIFF_RC_OK;
519  }
520  
521  static int
522  diff_run_algo(const struct diff_algo_config *algo_config,
523  	      struct diff_state *state)
524  {
525  	int rc;
526  
527  	if (!algo_config || !algo_config->impl
528  	    || !state->recursion_depth_left
529  	    || !state->left.atoms.len || !state->right.atoms.len) {
530  		debug("Fall back to diff_algo_none():%s%s%s\n",
531  		      (!algo_config || !algo_config->impl) ? " no-cfg" : "",
532  		      (!state->recursion_depth_left) ? " max-depth" : "",
533  		      (!state->left.atoms.len || !state->right.atoms.len)?
534  			      " trivial" : "");
535  		return diff_algo_none(algo_config, state);
536  	}
537  
538  	ARRAYLIST_FREE(state->temp_result);
539  	ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE);
540  	rc = algo_config->impl(algo_config, state);
541  	switch (rc) {
542  	case DIFF_RC_USE_DIFF_ALGO_FALLBACK:
543  		debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n",
544  		      algo_config->fallback_algo);
545  		rc = diff_run_algo(algo_config->fallback_algo, state);
546  		goto return_rc;
547  
548  	case DIFF_RC_OK:
549  		/* continue below */
550  		break;
551  
552  	default:
553  		/* some error happened */
554  		goto return_rc;
555  	}
556  
557  	/* Pick up any diff chunks that are still unsolved and feed to
558  	 * inner_algo.  inner_algo will solve unsolved chunks and append to
559  	 * result, and subsequent solved chunks on this level are then appended
560  	 * to result afterwards. */
561  	int i;
562  	for (i = 0; i < state->temp_result.len; i++) {
563  		struct diff_chunk *c = &state->temp_result.head[i];
564  		if (c->solved) {
565  			diff_state_add_solved_chunk(state, c);
566  			continue;
567  		}
568  
569  		/* c is an unsolved chunk, feed to inner_algo */
570  		struct diff_state inner_state = {
571  			.result = state->result,
572  			.recursion_depth_left = state->recursion_depth_left - 1,
573  			.kd_buf = state->kd_buf,
574  			.kd_buf_size = state->kd_buf_size,
575  		};
576  		diff_data_init_subsection(&inner_state.left, &state->left,
577  					  c->left_start, c->left_count);
578  		diff_data_init_subsection(&inner_state.right, &state->right,
579  					  c->right_start, c->right_count);
580  
581  		rc = diff_run_algo(algo_config->inner_algo, &inner_state);
582  		state->kd_buf = inner_state.kd_buf;
583  		state->kd_buf_size = inner_state.kd_buf_size;
584  		if (rc != DIFF_RC_OK)
585  			goto return_rc;
586  	}
587  
588  	rc = DIFF_RC_OK;
589  return_rc:
590  	ARRAYLIST_FREE(state->temp_result);
591  	return rc;
592  }
593  
594  int
595  diff_atomize_file(struct diff_data *d,
596  		  const struct diff_config *config,
597  		  FILE *f, const uint8_t *data, off_t len, int diff_flags)
598  {
599  	if (!config->atomize_func)
600  		return EINVAL;
601  
602  	diff_data_init_root(d, f, data, len, diff_flags);
603  
604  	return config->atomize_func(config->atomize_func_data, d);
605  
606  }
607  
608  struct diff_result *
609  diff_main(const struct diff_config *config, struct diff_data *left,
610  	  struct diff_data *right)
611  {
612  	struct diff_result *result = malloc(sizeof(struct diff_result));
613  	if (!result)
614  		return NULL;
615  
616  	*result = (struct diff_result){};
617  	result->left = left;
618  	result->right = right;
619  
620  	struct diff_state state = {
621  		.result = result,
622  		.recursion_depth_left = config->max_recursion_depth ?
623  		    config->max_recursion_depth : UINT_MAX,
624  		.kd_buf = NULL,
625  		.kd_buf_size = 0,
626  	};
627  	diff_data_init_subsection(&state.left, left,
628  				  left->atoms.head,
629  				  left->atoms.len);
630  	diff_data_init_subsection(&state.right, right,
631  				  right->atoms.head,
632  				  right->atoms.len);
633  
634  	result->rc = diff_run_algo(config->algo, &state);
635  	free(state.kd_buf);
636  
637  	return result;
638  }
639  
640  void
641  diff_result_free(struct diff_result *result)
642  {
643  	if (!result)
644  		return;
645  	ARRAYLIST_FREE(result->chunks);
646  	free(result);
647  }
648  
649  int
650  diff_result_contains_printable_chunks(struct diff_result *result)
651  {
652  	struct diff_chunk *c;
653  	enum diff_chunk_type t;
654  	int i;
655  
656  	for (i = 0; i < result->chunks.len; i++) {
657  		c = &result->chunks.head[i];
658  		t = diff_chunk_type(c);
659  		if (t == CHUNK_MINUS || t == CHUNK_PLUS)
660  			return 1;
661  	}
662  
663  	return 0;
664  }
665