1*59c8e88eSDag-Erling Smørgrav /* Split source by line breaks, and calculate a simplistic checksum. */ 2*59c8e88eSDag-Erling Smørgrav /* 3*59c8e88eSDag-Erling Smørgrav * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de> 4*59c8e88eSDag-Erling Smørgrav * 5*59c8e88eSDag-Erling Smørgrav * Permission to use, copy, modify, and distribute this software for any 6*59c8e88eSDag-Erling Smørgrav * purpose with or without fee is hereby granted, provided that the above 7*59c8e88eSDag-Erling Smørgrav * copyright notice and this permission notice appear in all copies. 8*59c8e88eSDag-Erling Smørgrav * 9*59c8e88eSDag-Erling Smørgrav * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10*59c8e88eSDag-Erling Smørgrav * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11*59c8e88eSDag-Erling Smørgrav * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12*59c8e88eSDag-Erling Smørgrav * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13*59c8e88eSDag-Erling Smørgrav * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14*59c8e88eSDag-Erling Smørgrav * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15*59c8e88eSDag-Erling Smørgrav * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16*59c8e88eSDag-Erling Smørgrav */ 17*59c8e88eSDag-Erling Smørgrav 18*59c8e88eSDag-Erling Smørgrav #include <errno.h> 19*59c8e88eSDag-Erling Smørgrav #include <stdbool.h> 20*59c8e88eSDag-Erling Smørgrav #include <stdint.h> 21*59c8e88eSDag-Erling Smørgrav #include <stdio.h> 22*59c8e88eSDag-Erling Smørgrav #include <stdlib.h> 23*59c8e88eSDag-Erling Smørgrav #include <unistd.h> 24*59c8e88eSDag-Erling Smørgrav #include <ctype.h> 25*59c8e88eSDag-Erling Smørgrav 26*59c8e88eSDag-Erling Smørgrav #include <arraylist.h> 27*59c8e88eSDag-Erling Smørgrav #include <diff_main.h> 28*59c8e88eSDag-Erling Smørgrav 29*59c8e88eSDag-Erling Smørgrav #include "diff_internal.h" 30*59c8e88eSDag-Erling Smørgrav #include "diff_debug.h" 31*59c8e88eSDag-Erling Smørgrav 32*59c8e88eSDag-Erling Smørgrav unsigned int 33*59c8e88eSDag-Erling Smørgrav diff_atom_hash_update(unsigned int hash, unsigned char atom_byte) 34*59c8e88eSDag-Erling Smørgrav { 35*59c8e88eSDag-Erling Smørgrav return hash * 23 + atom_byte; 36*59c8e88eSDag-Erling Smørgrav } 37*59c8e88eSDag-Erling Smørgrav 38*59c8e88eSDag-Erling Smørgrav static int 39*59c8e88eSDag-Erling Smørgrav diff_data_atomize_text_lines_fd(struct diff_data *d) 40*59c8e88eSDag-Erling Smørgrav { 41*59c8e88eSDag-Erling Smørgrav off_t pos = 0; 42*59c8e88eSDag-Erling Smørgrav const off_t end = pos + d->len; 43*59c8e88eSDag-Erling Smørgrav unsigned int array_size_estimate = d->len / 50; 44*59c8e88eSDag-Erling Smørgrav unsigned int pow2 = 1; 45*59c8e88eSDag-Erling Smørgrav bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); 46*59c8e88eSDag-Erling Smørgrav bool embedded_nul = false; 47*59c8e88eSDag-Erling Smørgrav 48*59c8e88eSDag-Erling Smørgrav while (array_size_estimate >>= 1) 49*59c8e88eSDag-Erling Smørgrav pow2++; 50*59c8e88eSDag-Erling Smørgrav 51*59c8e88eSDag-Erling Smørgrav ARRAYLIST_INIT(d->atoms, 1 << pow2); 52*59c8e88eSDag-Erling Smørgrav 53*59c8e88eSDag-Erling Smørgrav if (fseek(d->root->f, 0L, SEEK_SET) == -1) 54*59c8e88eSDag-Erling Smørgrav return errno; 55*59c8e88eSDag-Erling Smørgrav 56*59c8e88eSDag-Erling Smørgrav while (pos < end) { 57*59c8e88eSDag-Erling Smørgrav off_t line_end = pos; 58*59c8e88eSDag-Erling Smørgrav unsigned int hash = 0; 59*59c8e88eSDag-Erling Smørgrav unsigned char buf[512]; 60*59c8e88eSDag-Erling Smørgrav size_t r, i; 61*59c8e88eSDag-Erling Smørgrav struct diff_atom *atom; 62*59c8e88eSDag-Erling Smørgrav int eol = 0; 63*59c8e88eSDag-Erling Smørgrav 64*59c8e88eSDag-Erling Smørgrav while (eol == 0 && line_end < end) { 65*59c8e88eSDag-Erling Smørgrav r = fread(buf, sizeof(char), sizeof(buf), d->root->f); 66*59c8e88eSDag-Erling Smørgrav if (r == 0 && ferror(d->root->f)) 67*59c8e88eSDag-Erling Smørgrav return EIO; 68*59c8e88eSDag-Erling Smørgrav i = 0; 69*59c8e88eSDag-Erling Smørgrav while (eol == 0 && i < r) { 70*59c8e88eSDag-Erling Smørgrav if (buf[i] != '\r' && buf[i] != '\n') { 71*59c8e88eSDag-Erling Smørgrav if (!ignore_whitespace 72*59c8e88eSDag-Erling Smørgrav || !isspace((unsigned char)buf[i])) 73*59c8e88eSDag-Erling Smørgrav hash = diff_atom_hash_update( 74*59c8e88eSDag-Erling Smørgrav hash, buf[i]); 75*59c8e88eSDag-Erling Smørgrav if (buf[i] == '\0') 76*59c8e88eSDag-Erling Smørgrav embedded_nul = true; 77*59c8e88eSDag-Erling Smørgrav line_end++; 78*59c8e88eSDag-Erling Smørgrav } else 79*59c8e88eSDag-Erling Smørgrav eol = buf[i]; 80*59c8e88eSDag-Erling Smørgrav i++; 81*59c8e88eSDag-Erling Smørgrav } 82*59c8e88eSDag-Erling Smørgrav } 83*59c8e88eSDag-Erling Smørgrav 84*59c8e88eSDag-Erling Smørgrav /* When not at the end of data, the line ending char ('\r' or 85*59c8e88eSDag-Erling Smørgrav * '\n') must follow */ 86*59c8e88eSDag-Erling Smørgrav if (line_end < end) 87*59c8e88eSDag-Erling Smørgrav line_end++; 88*59c8e88eSDag-Erling Smørgrav /* If that was an '\r', also pull in any following '\n' */ 89*59c8e88eSDag-Erling Smørgrav if (line_end < end && eol == '\r') { 90*59c8e88eSDag-Erling Smørgrav if (fseeko(d->root->f, line_end, SEEK_SET) == -1) 91*59c8e88eSDag-Erling Smørgrav return errno; 92*59c8e88eSDag-Erling Smørgrav r = fread(buf, sizeof(char), sizeof(buf), d->root->f); 93*59c8e88eSDag-Erling Smørgrav if (r == 0 && ferror(d->root->f)) 94*59c8e88eSDag-Erling Smørgrav return EIO; 95*59c8e88eSDag-Erling Smørgrav if (r > 0 && buf[0] == '\n') 96*59c8e88eSDag-Erling Smørgrav line_end++; 97*59c8e88eSDag-Erling Smørgrav } 98*59c8e88eSDag-Erling Smørgrav 99*59c8e88eSDag-Erling Smørgrav /* Record the found line as diff atom */ 100*59c8e88eSDag-Erling Smørgrav ARRAYLIST_ADD(atom, d->atoms); 101*59c8e88eSDag-Erling Smørgrav if (!atom) 102*59c8e88eSDag-Erling Smørgrav return ENOMEM; 103*59c8e88eSDag-Erling Smørgrav 104*59c8e88eSDag-Erling Smørgrav *atom = (struct diff_atom){ 105*59c8e88eSDag-Erling Smørgrav .root = d, 106*59c8e88eSDag-Erling Smørgrav .pos = pos, 107*59c8e88eSDag-Erling Smørgrav .at = NULL, /* atom data is not memory-mapped */ 108*59c8e88eSDag-Erling Smørgrav .len = line_end - pos, 109*59c8e88eSDag-Erling Smørgrav .hash = hash, 110*59c8e88eSDag-Erling Smørgrav }; 111*59c8e88eSDag-Erling Smørgrav 112*59c8e88eSDag-Erling Smørgrav /* Starting point for next line: */ 113*59c8e88eSDag-Erling Smørgrav pos = line_end; 114*59c8e88eSDag-Erling Smørgrav if (fseeko(d->root->f, pos, SEEK_SET) == -1) 115*59c8e88eSDag-Erling Smørgrav return errno; 116*59c8e88eSDag-Erling Smørgrav } 117*59c8e88eSDag-Erling Smørgrav 118*59c8e88eSDag-Erling Smørgrav /* File are considered binary if they contain embedded '\0' bytes. */ 119*59c8e88eSDag-Erling Smørgrav if (embedded_nul) 120*59c8e88eSDag-Erling Smørgrav d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; 121*59c8e88eSDag-Erling Smørgrav 122*59c8e88eSDag-Erling Smørgrav return DIFF_RC_OK; 123*59c8e88eSDag-Erling Smørgrav } 124*59c8e88eSDag-Erling Smørgrav 125*59c8e88eSDag-Erling Smørgrav static int 126*59c8e88eSDag-Erling Smørgrav diff_data_atomize_text_lines_mmap(struct diff_data *d) 127*59c8e88eSDag-Erling Smørgrav { 128*59c8e88eSDag-Erling Smørgrav const uint8_t *pos = d->data; 129*59c8e88eSDag-Erling Smørgrav const uint8_t *end = pos + d->len; 130*59c8e88eSDag-Erling Smørgrav bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); 131*59c8e88eSDag-Erling Smørgrav bool embedded_nul = false; 132*59c8e88eSDag-Erling Smørgrav unsigned int array_size_estimate = d->len / 50; 133*59c8e88eSDag-Erling Smørgrav unsigned int pow2 = 1; 134*59c8e88eSDag-Erling Smørgrav while (array_size_estimate >>= 1) 135*59c8e88eSDag-Erling Smørgrav pow2++; 136*59c8e88eSDag-Erling Smørgrav 137*59c8e88eSDag-Erling Smørgrav ARRAYLIST_INIT(d->atoms, 1 << pow2); 138*59c8e88eSDag-Erling Smørgrav 139*59c8e88eSDag-Erling Smørgrav while (pos < end) { 140*59c8e88eSDag-Erling Smørgrav const uint8_t *line_end = pos; 141*59c8e88eSDag-Erling Smørgrav unsigned int hash = 0; 142*59c8e88eSDag-Erling Smørgrav 143*59c8e88eSDag-Erling Smørgrav while (line_end < end && *line_end != '\r' && *line_end != '\n') { 144*59c8e88eSDag-Erling Smørgrav if (!ignore_whitespace 145*59c8e88eSDag-Erling Smørgrav || !isspace((unsigned char)*line_end)) 146*59c8e88eSDag-Erling Smørgrav hash = diff_atom_hash_update(hash, *line_end); 147*59c8e88eSDag-Erling Smørgrav if (*line_end == '\0') 148*59c8e88eSDag-Erling Smørgrav embedded_nul = true; 149*59c8e88eSDag-Erling Smørgrav line_end++; 150*59c8e88eSDag-Erling Smørgrav } 151*59c8e88eSDag-Erling Smørgrav 152*59c8e88eSDag-Erling Smørgrav /* When not at the end of data, the line ending char ('\r' or 153*59c8e88eSDag-Erling Smørgrav * '\n') must follow */ 154*59c8e88eSDag-Erling Smørgrav if (line_end < end && *line_end == '\r') 155*59c8e88eSDag-Erling Smørgrav line_end++; 156*59c8e88eSDag-Erling Smørgrav if (line_end < end && *line_end == '\n') 157*59c8e88eSDag-Erling Smørgrav line_end++; 158*59c8e88eSDag-Erling Smørgrav 159*59c8e88eSDag-Erling Smørgrav /* Record the found line as diff atom */ 160*59c8e88eSDag-Erling Smørgrav struct diff_atom *atom; 161*59c8e88eSDag-Erling Smørgrav ARRAYLIST_ADD(atom, d->atoms); 162*59c8e88eSDag-Erling Smørgrav if (!atom) 163*59c8e88eSDag-Erling Smørgrav return ENOMEM; 164*59c8e88eSDag-Erling Smørgrav 165*59c8e88eSDag-Erling Smørgrav *atom = (struct diff_atom){ 166*59c8e88eSDag-Erling Smørgrav .root = d, 167*59c8e88eSDag-Erling Smørgrav .pos = (off_t)(pos - d->data), 168*59c8e88eSDag-Erling Smørgrav .at = pos, 169*59c8e88eSDag-Erling Smørgrav .len = line_end - pos, 170*59c8e88eSDag-Erling Smørgrav .hash = hash, 171*59c8e88eSDag-Erling Smørgrav }; 172*59c8e88eSDag-Erling Smørgrav 173*59c8e88eSDag-Erling Smørgrav /* Starting point for next line: */ 174*59c8e88eSDag-Erling Smørgrav pos = line_end; 175*59c8e88eSDag-Erling Smørgrav } 176*59c8e88eSDag-Erling Smørgrav 177*59c8e88eSDag-Erling Smørgrav /* File are considered binary if they contain embedded '\0' bytes. */ 178*59c8e88eSDag-Erling Smørgrav if (embedded_nul) 179*59c8e88eSDag-Erling Smørgrav d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; 180*59c8e88eSDag-Erling Smørgrav 181*59c8e88eSDag-Erling Smørgrav return DIFF_RC_OK; 182*59c8e88eSDag-Erling Smørgrav } 183*59c8e88eSDag-Erling Smørgrav 184*59c8e88eSDag-Erling Smørgrav static int 185*59c8e88eSDag-Erling Smørgrav diff_data_atomize_text_lines(struct diff_data *d) 186*59c8e88eSDag-Erling Smørgrav { 187*59c8e88eSDag-Erling Smørgrav if (d->data == NULL) 188*59c8e88eSDag-Erling Smørgrav return diff_data_atomize_text_lines_fd(d); 189*59c8e88eSDag-Erling Smørgrav else 190*59c8e88eSDag-Erling Smørgrav return diff_data_atomize_text_lines_mmap(d); 191*59c8e88eSDag-Erling Smørgrav } 192*59c8e88eSDag-Erling Smørgrav 193*59c8e88eSDag-Erling Smørgrav int 194*59c8e88eSDag-Erling Smørgrav diff_atomize_text_by_line(void *func_data, struct diff_data *d) 195*59c8e88eSDag-Erling Smørgrav { 196*59c8e88eSDag-Erling Smørgrav return diff_data_atomize_text_lines(d); 197*59c8e88eSDag-Erling Smørgrav } 198