1 /* Split source by line breaks, and calculate a simplistic checksum. */ 2 /* 3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de> 4 * 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18 #include <errno.h> 19 #include <stdbool.h> 20 #include <stdint.h> 21 #include <stdio.h> 22 #include <stdlib.h> 23 #include <unistd.h> 24 #include <ctype.h> 25 26 #include <arraylist.h> 27 #include <diff_main.h> 28 29 #include "diff_internal.h" 30 #include "diff_debug.h" 31 32 unsigned int 33 diff_atom_hash_update(unsigned int hash, unsigned char atom_byte) 34 { 35 return hash * 23 + atom_byte; 36 } 37 38 static int 39 diff_data_atomize_text_lines_fd(struct diff_data *d) 40 { 41 off_t pos = 0; 42 const off_t end = pos + d->len; 43 unsigned int array_size_estimate = d->len / 50; 44 unsigned int pow2 = 1; 45 bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); 46 bool embedded_nul = false; 47 48 while (array_size_estimate >>= 1) 49 pow2++; 50 51 ARRAYLIST_INIT(d->atoms, 1 << pow2); 52 53 if (fseek(d->root->f, 0L, SEEK_SET) == -1) 54 return errno; 55 56 while (pos < end) { 57 off_t line_end = pos; 58 unsigned int hash = 0; 59 unsigned char buf[512]; 60 size_t r, i; 61 struct diff_atom *atom; 62 int eol = 0; 63 64 while (eol == 0 && line_end < end) { 65 r = fread(buf, sizeof(char), sizeof(buf), d->root->f); 66 if (r == 0 && ferror(d->root->f)) 67 return EIO; 68 i = 0; 69 while (eol == 0 && i < r) { 70 if (buf[i] != '\r' && buf[i] != '\n') { 71 if (!ignore_whitespace 72 || !isspace((unsigned char)buf[i])) 73 hash = diff_atom_hash_update( 74 hash, buf[i]); 75 if (buf[i] == '\0') 76 embedded_nul = true; 77 line_end++; 78 } else 79 eol = buf[i]; 80 i++; 81 } 82 } 83 84 /* When not at the end of data, the line ending char ('\r' or 85 * '\n') must follow */ 86 if (line_end < end) 87 line_end++; 88 /* If that was an '\r', also pull in any following '\n' */ 89 if (line_end < end && eol == '\r') { 90 if (fseeko(d->root->f, line_end, SEEK_SET) == -1) 91 return errno; 92 r = fread(buf, sizeof(char), sizeof(buf), d->root->f); 93 if (r == 0 && ferror(d->root->f)) 94 return EIO; 95 if (r > 0 && buf[0] == '\n') 96 line_end++; 97 } 98 99 /* Record the found line as diff atom */ 100 ARRAYLIST_ADD(atom, d->atoms); 101 if (!atom) 102 return ENOMEM; 103 104 *atom = (struct diff_atom){ 105 .root = d, 106 .pos = pos, 107 .at = NULL, /* atom data is not memory-mapped */ 108 .len = line_end - pos, 109 .hash = hash, 110 }; 111 112 /* Starting point for next line: */ 113 pos = line_end; 114 if (fseeko(d->root->f, pos, SEEK_SET) == -1) 115 return errno; 116 } 117 118 /* File are considered binary if they contain embedded '\0' bytes. */ 119 if (embedded_nul) 120 d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; 121 122 return DIFF_RC_OK; 123 } 124 125 static int 126 diff_data_atomize_text_lines_mmap(struct diff_data *d) 127 { 128 const uint8_t *pos = d->data; 129 const uint8_t *end = pos + d->len; 130 bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); 131 bool embedded_nul = false; 132 unsigned int array_size_estimate = d->len / 50; 133 unsigned int pow2 = 1; 134 while (array_size_estimate >>= 1) 135 pow2++; 136 137 ARRAYLIST_INIT(d->atoms, 1 << pow2); 138 139 while (pos < end) { 140 const uint8_t *line_end = pos; 141 unsigned int hash = 0; 142 143 while (line_end < end && *line_end != '\r' && *line_end != '\n') { 144 if (!ignore_whitespace 145 || !isspace((unsigned char)*line_end)) 146 hash = diff_atom_hash_update(hash, *line_end); 147 if (*line_end == '\0') 148 embedded_nul = true; 149 line_end++; 150 } 151 152 /* When not at the end of data, the line ending char ('\r' or 153 * '\n') must follow */ 154 if (line_end < end && *line_end == '\r') 155 line_end++; 156 if (line_end < end && *line_end == '\n') 157 line_end++; 158 159 /* Record the found line as diff atom */ 160 struct diff_atom *atom; 161 ARRAYLIST_ADD(atom, d->atoms); 162 if (!atom) 163 return ENOMEM; 164 165 *atom = (struct diff_atom){ 166 .root = d, 167 .pos = (off_t)(pos - d->data), 168 .at = pos, 169 .len = line_end - pos, 170 .hash = hash, 171 }; 172 173 /* Starting point for next line: */ 174 pos = line_end; 175 } 176 177 /* File are considered binary if they contain embedded '\0' bytes. */ 178 if (embedded_nul) 179 d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; 180 181 return DIFF_RC_OK; 182 } 183 184 static int 185 diff_data_atomize_text_lines(struct diff_data *d) 186 { 187 if (d->data == NULL) 188 return diff_data_atomize_text_lines_fd(d); 189 else 190 return diff_data_atomize_text_lines_mmap(d); 191 } 192 193 int 194 diff_atomize_text_by_line(void *func_data, struct diff_data *d) 195 { 196 return diff_data_atomize_text_lines(d); 197 } 198