1 /* Split source by line breaks, and calculate a simplistic checksum. */ 2 /* 3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de> 4 * 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18 #include <errno.h> 19 #include <setjmp.h> 20 #include <signal.h> 21 #include <stdbool.h> 22 #include <stdint.h> 23 #include <stdio.h> 24 #include <stdlib.h> 25 #include <unistd.h> 26 #include <ctype.h> 27 28 #include <arraylist.h> 29 #include <diff_main.h> 30 31 #include "diff_internal.h" 32 #include "diff_debug.h" 33 34 unsigned int 35 diff_atom_hash_update(unsigned int hash, unsigned char atom_byte) 36 { 37 return hash * 23 + atom_byte; 38 } 39 40 static int 41 diff_data_atomize_text_lines_fd(struct diff_data *d) 42 { 43 off_t pos = 0; 44 const off_t end = pos + d->len; 45 unsigned int array_size_estimate = d->len / 50; 46 unsigned int pow2 = 1; 47 bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); 48 bool embedded_nul = false; 49 50 while (array_size_estimate >>= 1) 51 pow2++; 52 53 ARRAYLIST_INIT(d->atoms, 1 << pow2); 54 55 if (fseek(d->root->f, 0L, SEEK_SET) == -1) 56 return errno; 57 58 while (pos < end) { 59 off_t line_end = pos; 60 unsigned int hash = 0; 61 unsigned char buf[512]; 62 size_t r, i; 63 struct diff_atom *atom; 64 int eol = 0; 65 66 while (eol == 0 && line_end < end) { 67 r = fread(buf, sizeof(char), sizeof(buf), d->root->f); 68 if (r == 0 && ferror(d->root->f)) 69 return EIO; 70 i = 0; 71 while (eol == 0 && i < r) { 72 if (buf[i] != '\r' && buf[i] != '\n') { 73 if (!ignore_whitespace 74 || !isspace((unsigned char)buf[i])) 75 hash = diff_atom_hash_update( 76 hash, buf[i]); 77 if (buf[i] == '\0') 78 embedded_nul = true; 79 line_end++; 80 } else 81 eol = buf[i]; 82 i++; 83 } 84 } 85 86 /* When not at the end of data, the line ending char ('\r' or 87 * '\n') must follow */ 88 if (line_end < end) 89 line_end++; 90 /* If that was an '\r', also pull in any following '\n' */ 91 if (line_end < end && eol == '\r') { 92 if (fseeko(d->root->f, line_end, SEEK_SET) == -1) 93 return errno; 94 r = fread(buf, sizeof(char), sizeof(buf), d->root->f); 95 if (r == 0 && ferror(d->root->f)) 96 return EIO; 97 if (r > 0 && buf[0] == '\n') 98 line_end++; 99 } 100 101 /* Record the found line as diff atom */ 102 ARRAYLIST_ADD(atom, d->atoms); 103 if (!atom) 104 return ENOMEM; 105 106 *atom = (struct diff_atom){ 107 .root = d, 108 .pos = pos, 109 .at = NULL, /* atom data is not memory-mapped */ 110 .len = line_end - pos, 111 .hash = hash, 112 }; 113 114 /* Starting point for next line: */ 115 pos = line_end; 116 if (fseeko(d->root->f, pos, SEEK_SET) == -1) 117 return errno; 118 } 119 120 /* File are considered binary if they contain embedded '\0' bytes. */ 121 if (embedded_nul) 122 d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; 123 124 return DIFF_RC_OK; 125 } 126 127 static sigjmp_buf diff_data_signal_env; 128 static void 129 diff_data_signal_handler(int sig) 130 { 131 siglongjmp(diff_data_signal_env, sig); 132 } 133 134 static int 135 diff_data_atomize_text_lines_mmap(struct diff_data *d) 136 { 137 struct sigaction act, oact; 138 const uint8_t *volatile pos = d->data; 139 const uint8_t *end = pos + d->len; 140 bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); 141 bool embedded_nul = false; 142 unsigned int array_size_estimate = d->len / 50; 143 unsigned int pow2 = 1; 144 while (array_size_estimate >>= 1) 145 pow2++; 146 147 ARRAYLIST_INIT(d->atoms, 1 << pow2); 148 149 sigemptyset(&act.sa_mask); 150 act.sa_flags = 0; 151 act.sa_handler = diff_data_signal_handler; 152 sigaction(SIGBUS, &act, &oact); 153 if (sigsetjmp(diff_data_signal_env, 0) > 0) { 154 /* 155 * The file was truncated while we were reading it. Set 156 * the end pointer to the beginning of the line we were 157 * trying to read, adjust the file length, and set a flag. 158 */ 159 end = pos; 160 d->len = end - d->data; 161 d->atomizer_flags |= DIFF_ATOMIZER_FILE_TRUNCATED; 162 } 163 while (pos < end) { 164 const uint8_t *line_start = pos, *line_end = pos; 165 unsigned int hash = 0; 166 167 while (line_end < end && *line_end != '\r' && *line_end != '\n') { 168 if (!ignore_whitespace 169 || !isspace((unsigned char)*line_end)) 170 hash = diff_atom_hash_update(hash, *line_end); 171 if (*line_end == '\0') 172 embedded_nul = true; 173 line_end++; 174 } 175 176 /* When not at the end of data, the line ending char ('\r' or 177 * '\n') must follow */ 178 if (line_end < end && *line_end == '\r') 179 line_end++; 180 if (line_end < end && *line_end == '\n') 181 line_end++; 182 183 /* Record the found line as diff atom */ 184 struct diff_atom *atom; 185 ARRAYLIST_ADD(atom, d->atoms); 186 if (!atom) 187 return ENOMEM; 188 189 *atom = (struct diff_atom){ 190 .root = d, 191 .pos = (off_t)(line_start - d->data), 192 .at = line_start, 193 .len = line_end - line_start, 194 .hash = hash, 195 }; 196 197 /* Starting point for next line: */ 198 pos = line_end; 199 } 200 sigaction(SIGBUS, &oact, NULL); 201 202 /* File are considered binary if they contain embedded '\0' bytes. */ 203 if (embedded_nul) 204 d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; 205 206 return DIFF_RC_OK; 207 } 208 209 static int 210 diff_data_atomize_text_lines(struct diff_data *d) 211 { 212 if (d->data == NULL) 213 return diff_data_atomize_text_lines_fd(d); 214 else 215 return diff_data_atomize_text_lines_mmap(d); 216 } 217 218 int 219 diff_atomize_text_by_line(void *func_data, struct diff_data *d) 220 { 221 return diff_data_atomize_text_lines(d); 222 } 223