/* Generic infrastructure to implement various diff algorithms. */
/*
 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

struct diff_range {
	int start;
	int end;
};

/* List of all possible return codes of a diff invocation. */
#define DIFF_RC_USE_DIFF_ALGO_FALLBACK	-1
#define DIFF_RC_OK			0
/* Any positive return values are errno values from sys/errno.h */

struct diff_atom {
	struct diff_data *root;	/* back pointer to root diff data */

	off_t pos;		/* set whether memory-mapped or not */
	const uint8_t *at;	/* only set if memory-mapped */
	off_t len;

	/* This hash is just a very cheap speed up for finding *mismatching*
	 * atoms. When hashes match, we still need to compare entire atoms to
	 * find out whether they are indeed identical or not.
	 * Calculated over all atom bytes with diff_atom_hash_update(). */
	unsigned int hash;
};

/* Mix another atom_byte into the provided hash value and return the result.
 * The hash value passed in for the first byte of the atom must be zero. */
unsigned int
diff_atom_hash_update(unsigned int hash, unsigned char atom_byte);

/* Compare two atoms for equality. Return 0 on success, or errno on failure.
 * Set cmp to -1, 0, or 1, just like strcmp(). */
int
diff_atom_cmp(int *cmp,
	      const struct diff_atom *left,
	      const struct diff_atom *right);
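
/* Example (an illustrative sketch, not part of this API): callers can use
 * the hash as a cheap filter for mismatches, but when hashes collide, only
 * a full diff_atom_cmp() can confirm a match. The helper name and the bool
 * out-parameter are hypothetical.
 *
 *	static int
 *	atoms_identical(const struct diff_atom *a, const struct diff_atom *b,
 *			bool *identical)
 *	{
 *		int cmp, err;
 *
 *		if (a->hash != b->hash) {
 *			*identical = false;	// hashes differ: no match
 *			return 0;
 *		}
 *		err = diff_atom_cmp(&cmp, a, b);
 *		if (err)
 *			return err;		// an errno value
 *		*identical = (cmp == 0);
 *		return 0;
 *	}
 */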

/* The atom's index in the entire file. For atoms divided by lines of text,
 * this yields the line number (starting with 0). Also works for diff_data
 * that reference only a subsection of a file, always reflecting the global
 * position in the file (and not the relative position within the
 * subsection). */
#define diff_atom_root_idx(DIFF_DATA, ATOM) \
	((ATOM) && ((ATOM) >= (DIFF_DATA)->root->atoms.head) \
	 ? (unsigned int)((ATOM) - ((DIFF_DATA)->root->atoms.head)) \
	 : (DIFF_DATA)->root->atoms.len)

/* The atom's index within DIFF_DATA. For atoms divided by lines of text,
 * this yields the line number (starting with 0). */
#define diff_atom_idx(DIFF_DATA, ATOM) \
	((ATOM) && ((ATOM) >= (DIFF_DATA)->atoms.head) \
	 ? (unsigned int)((ATOM) - ((DIFF_DATA)->atoms.head)) \
	 : (DIFF_DATA)->atoms.len)

#define foreach_diff_atom(ATOM, FIRST_ATOM, COUNT) \
	for ((ATOM) = (FIRST_ATOM); \
	     (ATOM) \
	     && ((ATOM) >= (FIRST_ATOM)) \
	     && ((ATOM) - (FIRST_ATOM) < (COUNT)); \
	     (ATOM)++)

#define diff_data_foreach_atom(ATOM, DIFF_DATA) \
	foreach_diff_atom(ATOM, (DIFF_DATA)->atoms.head, (DIFF_DATA)->atoms.len)

#define diff_data_foreach_atom_from(FROM, ATOM, DIFF_DATA) \
	for ((ATOM) = (FROM); \
	     (ATOM) \
	     && ((ATOM) >= (DIFF_DATA)->atoms.head) \
	     && ((ATOM) - (DIFF_DATA)->atoms.head < (DIFF_DATA)->atoms.len); \
	     (ATOM)++)

#define diff_data_foreach_atom_backwards_from(FROM, ATOM, DIFF_DATA) \
	for ((ATOM) = (FROM); \
	     (ATOM) \
	     && ((ATOM) >= (DIFF_DATA)->atoms.head) \
	     && ((ATOM) - (DIFF_DATA)->atoms.head >= 0); \
	     (ATOM)--)
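
/* Example (an illustrative sketch, not part of this API): walk all atoms of
 * a struct diff_data *d and print each atom's global line number. The macro
 * expands to a plain for loop, so a normal block body applies.
 *
 *	struct diff_atom *atom;
 *
 *	diff_data_foreach_atom(atom, d) {
 *		printf("line %u\n", diff_atom_root_idx(d, atom));
 *	}
 */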

/* For each file, there is a "root" struct diff_data referencing the entire
 * file, which the atoms are parsed from. During recursion of the diff
 * algorithm, there may be "child" struct diff_data only referencing a
 * subsection of the file, re-using the atoms parsed for the root. For "root"
 * structs, atoms_allocated will be nonzero, indicating that the array of
 * atoms is owned by that struct. For "child" structs, atoms_allocated == 0,
 * to indicate that the struct is referencing a subset of atoms. */
struct diff_data {
	FILE *f;		/* if root diff_data and not memory-mapped */
	off_t pos;		/* if not memory-mapped */
	const uint8_t *data;	/* if memory-mapped */
	off_t len;

	int atomizer_flags;
	ARRAYLIST(struct diff_atom) atoms;
	struct diff_data *root;
	struct diff_data *current;
	void *algo_data;

	int diff_flags;

	int err;
};

/* Flags set by file atomizer. */
#define DIFF_ATOMIZER_FOUND_BINARY_DATA	0x00000001
#define DIFF_ATOMIZER_FILE_TRUNCATED	0x00000002

/* Flags set by caller of diff_main(). */
#define DIFF_FLAG_IGNORE_WHITESPACE	0x00000001
#define DIFF_FLAG_SHOW_PROTOTYPES	0x00000002
#define DIFF_FLAG_FORCE_TEXT_DATA	0x00000004

void diff_data_free(struct diff_data *diff_data);

struct diff_chunk;
typedef ARRAYLIST(struct diff_chunk) diff_chunk_arraylist_t;

struct diff_result {
	int rc;

	/*
	 * Pointers to diff data passed in via diff_main().
	 * Do not free these diff_data before freeing the diff_result struct.
	 */
	struct diff_data *left;
	struct diff_data *right;

	diff_chunk_arraylist_t chunks;
};

enum diff_chunk_type {
	CHUNK_EMPTY,
	CHUNK_PLUS,
	CHUNK_MINUS,
	CHUNK_SAME,
	CHUNK_ERROR,
};

enum diff_chunk_type diff_chunk_type(const struct diff_chunk *c);

struct diff_state;

/* Signature of a utility function to divide a file into diff atoms.
 * An example is diff_atomize_text_by_line() in diff_atomize_text.c.
 *
 * func_data: context pointer (free to be used by implementations).
 * d: struct diff_data with d->data and d->len already set up, and
 *    d->atoms to be created and d->atomizer_flags to be set up.
 */
typedef int (*diff_atomize_func_t)(void *func_data, struct diff_data *d);

extern int diff_atomize_text_by_line(void *func_data, struct diff_data *d);
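
/* Example (an illustrative sketch, not part of this API): a custom atomizer
 * that splits memory-mapped data into fixed-size blocks instead of lines.
 * The function name and block size are hypothetical; ARRAYLIST_INIT() and
 * ARRAYLIST_ADD() are assumed to be the helpers from arraylist.h, with
 * ARRAYLIST_ADD() yielding NULL on allocation failure. Only the
 * memory-mapped case (d->data set) is handled here.
 *
 *	static int
 *	diff_atomize_fixed_blocks(void *func_data, struct diff_data *d)
 *	{
 *		const off_t block_size = 512;
 *		off_t pos = 0;
 *
 *		(void)func_data;	// no context needed
 *		ARRAYLIST_INIT(d->atoms, 128);
 *		while (pos < d->len) {
 *			struct diff_atom *atom;
 *			off_t len = d->len - pos < block_size ?
 *			    d->len - pos : block_size;
 *			unsigned int hash = 0;
 *			off_t i;
 *
 *			// Hash all bytes of the atom, starting from zero.
 *			for (i = 0; i < len; i++)
 *				hash = diff_atom_hash_update(hash,
 *				    d->data[pos + i]);
 *
 *			ARRAYLIST_ADD(atom, d->atoms);
 *			if (atom == NULL)
 *				return ENOMEM;
 *			*atom = (struct diff_atom){
 *				.root = d,
 *				.pos = pos,
 *				.at = d->data + pos,
 *				.len = len,
 *				.hash = hash,
 *			};
 *			pos += len;
 *		}
 *		d->atomizer_flags = 0;
 *		return DIFF_RC_OK;
 *	}
 */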

struct diff_algo_config;
typedef int (*diff_algo_impl_t)(
	const struct diff_algo_config *algo_config, struct diff_state *state);

/* Form a result with all left-side removed and all right-side added, i.e. no
 * actual diff algorithm involved. */
int diff_algo_none(const struct diff_algo_config *algo_config,
		   struct diff_state *state);

/* Myers Diff tracing from the start all the way through to the end, requiring
 * quadratic amounts of memory. This can fail if the required space surpasses
 * algo_config->permitted_state_size. */
extern int diff_algo_myers(const struct diff_algo_config *algo_config,
			   struct diff_state *state);

/* Myers "Divide et Impera": tracing forwards from the start and backwards
 * from the end to find a midpoint that divides the problem into smaller
 * chunks. Requires only linear amounts of memory. */
extern int diff_algo_myers_divide(
	const struct diff_algo_config *algo_config, struct diff_state *state);

/* Patience Diff algorithm, which divides a larger diff into smaller chunks.
 * For very specific scenarios, it may lead to a complete diff result by
 * itself, but needs a fallback algo to solve chunks that don't have
 * common-unique atoms. */
extern int diff_algo_patience(
	const struct diff_algo_config *algo_config, struct diff_state *state);

/* Diff algorithms to use, possibly nested. For example:
 *
 *	struct diff_algo_config myers, patience, myers_divide;
 *
 *	myers = (struct diff_algo_config){
 *		.impl = diff_algo_myers,
 *		.permitted_state_size = 32 * 1024 * 1024,
 *		// When too large, do diff_algo_patience:
 *		.fallback_algo = &patience,
 *	};
 *
 *	patience = (struct diff_algo_config){
 *		.impl = diff_algo_patience,
 *		// After subdivision, do Patience again:
 *		.inner_algo = &patience,
 *		// If subdivision failed, do Myers Divide et Impera:
 *		.fallback_algo = &myers_divide,
 *	};
 *
 *	myers_divide = (struct diff_algo_config){
 *		.impl = diff_algo_myers_divide,
 *		// When division succeeded, start from the top:
 *		.inner_algo = &myers,
 *		// (fallback_algo = NULL implies diff_algo_none).
 *	};
 *
 *	struct diff_config config = {
 *		.algo = &myers,
 *		...
 *	};
 *	diff_main(&config, ...);
 */
struct diff_algo_config {
	diff_algo_impl_t impl;

	/* Fail this algo if it would use more than this amount of memory, and
	 * instead use fallback_algo (diff_algo_myers). permitted_state_size ==
	 * 0 means no limitation. */
	size_t permitted_state_size;

	/* For algorithms that divide into smaller chunks, use this algorithm
	 * to solve the divided chunks. */
	const struct diff_algo_config *inner_algo;

	/* If the algorithm fails (e.g. diff_algo_myers needs too large a
	 * state, or diff_algo_patience can't find any common-unique atoms),
	 * then use this algorithm instead. */
	const struct diff_algo_config *fallback_algo;
};

struct diff_config {
	diff_atomize_func_t atomize_func;
	void *atomize_func_data;

	const struct diff_algo_config *algo;

	/* How deep to step into subdivisions of a source file, a paranoia /
	 * safety measure to guard against infinite loops through diff
	 * algorithms. When the maximum recursion is reached, employ
	 * diff_algo_none (i.e. remove all left atoms and add all right
	 * atoms). */
	unsigned int max_recursion_depth;
};

int diff_atomize_file(struct diff_data *d, const struct diff_config *config,
		      FILE *f, const uint8_t *data, off_t len, int diff_flags);
struct diff_result *diff_main(const struct diff_config *config,
			      struct diff_data *left,
			      struct diff_data *right);
void diff_result_free(struct diff_result *result);
int diff_result_contains_printable_chunks(struct diff_result *result);
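
/* Example (an illustrative sketch, not part of this API): a minimal
 * end-to-end invocation over two memory-mapped buffers, assuming a nested
 * algo config like the "myers" one shown above. The buffer names, the
 * recursion depth, and the error handling are hypothetical placeholders.
 *
 *	struct diff_config config = {
 *		.atomize_func = diff_atomize_text_by_line,
 *		.algo = &myers,			// e.g. nested as above
 *		.max_recursion_depth = 32,	// arbitrary example value
 *	};
 *	struct diff_data left = {0}, right = {0};
 *	struct diff_result *result;
 *	unsigned int i;
 *
 *	if (diff_atomize_file(&left, &config, NULL, left_buf, left_len, 0)
 *	    || diff_atomize_file(&right, &config, NULL, right_buf,
 *	    right_len, 0))
 *		err(1, "atomize");
 *	result = diff_main(&config, &left, &right);
 *	if (result == NULL || result->rc != DIFF_RC_OK)
 *		errx(1, "diff failed");
 *	for (i = 0; i < result->chunks.len; i++) {
 *		enum diff_chunk_type t =
 *		    diff_chunk_type(&result->chunks.head[i]);
 *		// ... emit '+', '-', or context lines according to t ...
 *	}
 *	diff_result_free(result);
 *	diff_data_free(&left);
 *	diff_data_free(&right);
 */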