1 /* Split source by line breaks, and calculate a simplistic checksum. */ 2 /* 3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de> 4 * 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18 #include <errno.h> 19 #include <setjmp.h> 20 #include <signal.h> 21 #include <stdbool.h> 22 #include <stdint.h> 23 #include <stdio.h> 24 #include <stdlib.h> 25 #include <unistd.h> 26 #include <ctype.h> 27 28 #include <arraylist.h> 29 #include <diff_main.h> 30 31 #include "diff_internal.h" 32 #include "diff_debug.h" 33 34 unsigned int 35 diff_atom_hash_update(unsigned int hash, unsigned char atom_byte) 36 { 37 return hash * 23 + atom_byte; 38 } 39 40 static int 41 diff_data_atomize_text_lines_fd(struct diff_data *d) 42 { 43 off_t pos = 0; 44 const off_t end = pos + d->len; 45 unsigned int array_size_estimate = d->len / 50; 46 unsigned int pow2 = 1; 47 bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); 48 bool embedded_nul = false; 49 50 while (array_size_estimate >>= 1) 51 pow2++; 52 53 ARRAYLIST_INIT(d->atoms, 1 << pow2); 54 55 if (fseek(d->root->f, 0L, SEEK_SET) == -1) 56 return errno; 57 58 while (pos < end) { 59 off_t line_end = pos; 60 unsigned int hash = 0; 61 unsigned char buf[512]; 62 size_t r, i; 63 struct diff_atom *atom; 64 int eol = 0; 65 66 while (eol == 0 && line_end < end) { 67 r = fread(buf, sizeof(char), sizeof(buf), d->root->f); 68 if (r == 0 && ferror(d->root->f)) 69 return EIO; 70 i = 0; 71 while (eol == 0 && i < r) { 72 if (buf[i] != '\r' && buf[i] != '\n') { 73 if (!ignore_whitespace 74 || !isspace((unsigned char)buf[i])) 75 hash = diff_atom_hash_update( 76 hash, buf[i]); 77 if (buf[i] == '\0') 78 embedded_nul = true; 79 line_end++; 80 } else 81 eol = buf[i]; 82 i++; 83 } 84 } 85 86 /* When not at the end of data, the line ending char ('\r' or 87 * '\n') must follow */ 88 if (line_end < end) 89 line_end++; 90 /* If that was an '\r', also pull in any following '\n' */ 91 if (line_end < end && eol == '\r') { 92 if (fseeko(d->root->f, line_end, SEEK_SET) == -1) 93 return errno; 94 r = fread(buf, sizeof(char), sizeof(buf), d->root->f); 95 if (r == 0 && ferror(d->root->f)) 96 return EIO; 97 if (r > 0 && buf[0] == '\n') 98 line_end++; 99 } 100 101 /* Record the found line as diff atom */ 102 ARRAYLIST_ADD(atom, d->atoms); 103 if (!atom) 104 return ENOMEM; 105 106 *atom = (struct diff_atom){ 107 .root = d, 108 .pos = pos, 109 .at = NULL, /* atom data is not memory-mapped */ 110 .len = line_end - pos, 111 .hash = hash, 112 }; 113 114 /* Starting point for next line: */ 115 pos = line_end; 116 if (fseeko(d->root->f, pos, SEEK_SET) == -1) 117 return errno; 118 } 119 120 /* File are considered binary if they contain embedded '\0' bytes. */ 121 if (embedded_nul) 122 d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; 123 124 return DIFF_RC_OK; 125 } 126 127 static sigjmp_buf diff_data_signal_env; 128 static void 129 diff_data_signal_handler(int sig) 130 { 131 siglongjmp(diff_data_signal_env, sig); 132 } 133 134 static int 135 diff_data_atomize_text_lines_mmap(struct diff_data *d) 136 { 137 struct sigaction act, oact; 138 const uint8_t *volatile pos = d->data; 139 const uint8_t *end = pos + d->len; 140 bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); 141 bool embedded_nul = false; 142 unsigned int array_size_estimate = d->len / 50; 143 unsigned int pow2 = 1; 144 int ret = DIFF_RC_OK; 145 while (array_size_estimate >>= 1) 146 pow2++; 147 148 ARRAYLIST_INIT(d->atoms, 1 << pow2); 149 150 sigemptyset(&act.sa_mask); 151 act.sa_flags = 0; 152 act.sa_handler = diff_data_signal_handler; 153 sigaction(SIGBUS, &act, &oact); 154 if (sigsetjmp(diff_data_signal_env, 0) > 0) { 155 /* 156 * The file was truncated while we were reading it, or an 157 * I/O error occurred. Set the end pointer to the 158 * beginning of the line we were trying to read, adjust 159 * the file length, and set the return value to an error. 160 */ 161 end = pos; 162 d->len = end - d->data; 163 ret = EIO; 164 } 165 while (pos < end) { 166 const uint8_t *line_start = pos, *line_end = pos; 167 unsigned int hash = 0; 168 169 while (line_end < end && *line_end != '\r' && *line_end != '\n') { 170 if (!ignore_whitespace 171 || !isspace((unsigned char)*line_end)) 172 hash = diff_atom_hash_update(hash, *line_end); 173 if (*line_end == '\0') 174 embedded_nul = true; 175 line_end++; 176 } 177 178 /* When not at the end of data, the line ending char ('\r' or 179 * '\n') must follow */ 180 if (line_end < end && *line_end == '\r') 181 line_end++; 182 if (line_end < end && *line_end == '\n') 183 line_end++; 184 185 /* Record the found line as diff atom */ 186 struct diff_atom *atom; 187 ARRAYLIST_ADD(atom, d->atoms); 188 if (!atom) 189 return ENOMEM; 190 191 *atom = (struct diff_atom){ 192 .root = d, 193 .pos = (off_t)(line_start - d->data), 194 .at = line_start, 195 .len = line_end - line_start, 196 .hash = hash, 197 }; 198 199 /* Starting point for next line: */ 200 pos = line_end; 201 } 202 sigaction(SIGBUS, &oact, NULL); 203 204 /* File are considered binary if they contain embedded '\0' bytes. */ 205 if (embedded_nul) 206 d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; 207 208 return ret; 209 } 210 211 static int 212 diff_data_atomize_text_lines(struct diff_data *d) 213 { 214 if (d->data == NULL) 215 return diff_data_atomize_text_lines_fd(d); 216 else 217 return diff_data_atomize_text_lines_mmap(d); 218 } 219 220 int 221 diff_atomize_text_by_line(void *func_data, struct diff_data *d) 222 { 223 return diff_data_atomize_text_lines(d); 224 } 225