1 /* Generic infrastructure to implement various diff algorithms. */ 2 /* 3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de> 4 * 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18 struct diff_range { 19 int start; 20 int end; 21 }; 22 23 /* List of all possible return codes of a diff invocation. */ 24 #define DIFF_RC_USE_DIFF_ALGO_FALLBACK -1 25 #define DIFF_RC_OK 0 26 /* Any positive return values are errno values from sys/errno.h */ 27 28 struct diff_atom { 29 struct diff_data *root; /* back pointer to root diff data */ 30 31 off_t pos; /* set whether memory-mapped or not */ 32 const uint8_t *at; /* only set if memory-mapped */ 33 off_t len; 34 35 /* This hash is just a very cheap speed up for finding *mismatching* 36 * atoms. When hashes match, we still need to compare entire atoms to 37 * find out whether they are indeed identical or not. 38 * Calculated over all atom bytes with diff_atom_hash_update(). */ 39 unsigned int hash; 40 }; 41 42 /* Mix another atom_byte into the provided hash value and return the result. 43 * The hash value passed in for the first byte of the atom must be zero. */ 44 unsigned int 45 diff_atom_hash_update(unsigned int hash, unsigned char atom_byte); 46 47 /* Compare two atoms for equality. Return 0 on success, or errno on failure. 48 * Set cmp to -1, 0, or 1, just like strcmp(). */ 49 int 50 diff_atom_cmp(int *cmp, 51 const struct diff_atom *left, 52 const struct diff_atom *right); 53 54 55 /* The atom's index in the entire file. For atoms divided by lines of text, this 56 * yields the line number (starting with 0). Also works for diff_data that 57 * reference only a subsection of a file, always reflecting the global position 58 * in the file (and not the relative position within the subsection). */ 59 #define diff_atom_root_idx(DIFF_DATA, ATOM) \ 60 ((ATOM) && ((ATOM) >= (DIFF_DATA)->root->atoms.head) \ 61 ? (unsigned int)((ATOM) - ((DIFF_DATA)->root->atoms.head)) \ 62 : (DIFF_DATA)->root->atoms.len) 63 64 /* The atom's index within DIFF_DATA. For atoms divided by lines of text, this 65 * yields the line number (starting with 0). */ 66 #define diff_atom_idx(DIFF_DATA, ATOM) \ 67 ((ATOM) && ((ATOM) >= (DIFF_DATA)->atoms.head) \ 68 ? (unsigned int)((ATOM) - ((DIFF_DATA)->atoms.head)) \ 69 : (DIFF_DATA)->atoms.len) 70 71 #define foreach_diff_atom(ATOM, FIRST_ATOM, COUNT) \ 72 for ((ATOM) = (FIRST_ATOM); \ 73 (ATOM) \ 74 && ((ATOM) >= (FIRST_ATOM)) \ 75 && ((ATOM) - (FIRST_ATOM) < (COUNT)); \ 76 (ATOM)++) 77 78 #define diff_data_foreach_atom(ATOM, DIFF_DATA) \ 79 foreach_diff_atom(ATOM, (DIFF_DATA)->atoms.head, (DIFF_DATA)->atoms.len) 80 81 #define diff_data_foreach_atom_from(FROM, ATOM, DIFF_DATA) \ 82 for ((ATOM) = (FROM); \ 83 (ATOM) \ 84 && ((ATOM) >= (DIFF_DATA)->atoms.head) \ 85 && ((ATOM) - (DIFF_DATA)->atoms.head < (DIFF_DATA)->atoms.len); \ 86 (ATOM)++) 87 88 #define diff_data_foreach_atom_backwards_from(FROM, ATOM, DIFF_DATA) \ 89 for ((ATOM) = (FROM); \ 90 (ATOM) \ 91 && ((ATOM) >= (DIFF_DATA)->atoms.head) \ 92 && ((ATOM) - (DIFF_DATA)->atoms.head >= 0); \ 93 (ATOM)--) 94 95 /* For each file, there is a "root" struct diff_data referencing the entire 96 * file, which the atoms are parsed from. In recursion of diff algorithm, there 97 * may be "child" struct diff_data only referencing a subsection of the file, 98 * re-using the atoms parsing. For "root" structs, atoms_allocated will be 99 * nonzero, indicating that the array of atoms is owned by that struct. For 100 * "child" structs, atoms_allocated == 0, to indicate that the struct is 101 * referencing a subset of atoms. */ 102 struct diff_data { 103 FILE *f; /* if root diff_data and not memory-mapped */ 104 off_t pos; /* if not memory-mapped */ 105 const uint8_t *data; /* if memory-mapped */ 106 off_t len; 107 108 int atomizer_flags; 109 ARRAYLIST(struct diff_atom) atoms; 110 struct diff_data *root; 111 struct diff_data *current; 112 void *algo_data; 113 114 int diff_flags; 115 116 int err; 117 }; 118 119 /* Flags set by file atomizer. */ 120 #define DIFF_ATOMIZER_FOUND_BINARY_DATA 0x00000001 121 122 /* Flags set by caller of diff_main(). */ 123 #define DIFF_FLAG_IGNORE_WHITESPACE 0x00000001 124 #define DIFF_FLAG_SHOW_PROTOTYPES 0x00000002 125 #define DIFF_FLAG_FORCE_TEXT_DATA 0x00000004 126 127 void diff_data_free(struct diff_data *diff_data); 128 129 struct diff_chunk; 130 typedef ARRAYLIST(struct diff_chunk) diff_chunk_arraylist_t; 131 132 struct diff_result { 133 int rc; 134 135 /* 136 * Pointers to diff data passed in via diff_main. 137 * Do not free these diff_data before freeing the diff_result struct. 138 */ 139 struct diff_data *left; 140 struct diff_data *right; 141 142 diff_chunk_arraylist_t chunks; 143 }; 144 145 enum diff_chunk_type { 146 CHUNK_EMPTY, 147 CHUNK_PLUS, 148 CHUNK_MINUS, 149 CHUNK_SAME, 150 CHUNK_ERROR, 151 }; 152 153 enum diff_chunk_type diff_chunk_type(const struct diff_chunk *c); 154 155 struct diff_state; 156 157 /* Signature of a utility function to divide a file into diff atoms. 158 * An example is diff_atomize_text_by_line() in diff_atomize_text.c. 159 * 160 * func_data: context pointer (free to be used by implementation). 161 * d: struct diff_data with d->data and d->len already set up, and 162 * d->atoms to be created and d->atomizer_flags to be set up. 163 */ 164 typedef int (*diff_atomize_func_t)(void *func_data, struct diff_data *d); 165 166 extern int diff_atomize_text_by_line(void *func_data, struct diff_data *d); 167 168 struct diff_algo_config; 169 typedef int (*diff_algo_impl_t)( 170 const struct diff_algo_config *algo_config, struct diff_state *state); 171 172 /* Form a result with all left-side removed and all right-side added, i.e. no 173 * actual diff algorithm involved. */ 174 int diff_algo_none(const struct diff_algo_config *algo_config, 175 struct diff_state *state); 176 177 /* Myers Diff tracing from the start all the way through to the end, requiring 178 * quadratic amounts of memory. This can fail if the required space surpasses 179 * algo_config->permitted_state_size. */ 180 extern int diff_algo_myers(const struct diff_algo_config *algo_config, 181 struct diff_state *state); 182 183 /* Myers "Divide et Impera": tracing forwards from the start and backwards from 184 * the end to find a midpoint that divides the problem into smaller chunks. 185 * Requires only linear amounts of memory. */ 186 extern int diff_algo_myers_divide( 187 const struct diff_algo_config *algo_config, struct diff_state *state); 188 189 /* Patience Diff algorithm, which divides a larger diff into smaller chunks. For 190 * very specific scenarios, it may lead to a complete diff result by itself, but 191 * needs a fallback algo to solve chunks that don't have common-unique atoms. */ 192 extern int diff_algo_patience( 193 const struct diff_algo_config *algo_config, struct diff_state *state); 194 195 /* Diff algorithms to use, possibly nested. For example: 196 * 197 * struct diff_algo_config myers, patience, myers_divide; 198 * 199 * myers = (struct diff_algo_config){ 200 * .impl = diff_algo_myers, 201 * .permitted_state_size = 32 * 1024 * 1024, 202 * // When too large, do diff_algo_patience: 203 * .fallback_algo = &patience, 204 * }; 205 * 206 * const struct diff_algo_config patience = (struct diff_algo_config){ 207 * .impl = diff_algo_patience, 208 * // After subdivision, do Patience again: 209 * .inner_algo = &patience, 210 * // If subdivision failed, do Myers Divide et Impera: 211 * .fallback_algo = &myers_then_myers_divide, 212 * }; 213 * 214 * const struct diff_algo_config myers_divide = (struct diff_algo_config){ 215 * .impl = diff_algo_myers_divide, 216 * // When division succeeded, start from the top: 217 * .inner_algo = &myers_then_myers_divide, 218 * // (fallback_algo = NULL implies diff_algo_none). 219 * }; 220 * struct diff_config config = { 221 * .algo = &myers, 222 * ... 223 * }; 224 * diff_main(&config, ...); 225 */ 226 struct diff_algo_config { 227 diff_algo_impl_t impl; 228 229 /* Fail this algo if it would use more than this amount of memory, and 230 * instead use fallback_algo (diff_algo_myers). permitted_state_size == 231 * 0 means no limitation. */ 232 size_t permitted_state_size; 233 234 /* For algorithms that divide into smaller chunks, use this algorithm to 235 * solve the divided chunks. */ 236 const struct diff_algo_config *inner_algo; 237 238 /* If the algorithm fails (e.g. diff_algo_myers_if_small needs too large 239 * state, or diff_algo_patience can't find any common-unique atoms), 240 * then use this algorithm instead. */ 241 const struct diff_algo_config *fallback_algo; 242 }; 243 244 struct diff_config { 245 diff_atomize_func_t atomize_func; 246 void *atomize_func_data; 247 248 const struct diff_algo_config *algo; 249 250 /* How deep to step into subdivisions of a source file, a paranoia / 251 * safety measure to guard against infinite loops through diff 252 * algorithms. When the maximum recursion is reached, employ 253 * diff_algo_none (i.e. remove all left atoms and add all right atoms). 254 */ 255 unsigned int max_recursion_depth; 256 }; 257 258 int diff_atomize_file(struct diff_data *d, const struct diff_config *config, 259 FILE *f, const uint8_t *data, off_t len, int diff_flags); 260 struct diff_result *diff_main(const struct diff_config *config, 261 struct diff_data *left, 262 struct diff_data *right); 263 void diff_result_free(struct diff_result *result); 264 int diff_result_contains_printable_chunks(struct diff_result *result); 265