1 /* Split source by line breaks, and calculate a simplistic checksum. */
2 /*
3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18 #include <errno.h>
19 #include <setjmp.h>
20 #include <signal.h>
21 #include <stdbool.h>
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <unistd.h>
26 #include <ctype.h>
27
28 #include <arraylist.h>
29 #include <diff_main.h>
30
31 #include "diff_internal.h"
32 #include "diff_debug.h"
33
/*
 * Fold one more byte into a running line checksum.
 *
 * The previous hash value is multiplied by 23 and the byte value is added;
 * arithmetic is unsigned, so overflow simply wraps around.
 *
 * hash:      checksum accumulated so far (0 for the first byte of a line).
 * atom_byte: next byte to mix in.
 * Returns the updated checksum.
 */
unsigned int
diff_atom_hash_update(unsigned int hash, unsigned char atom_byte)
{
	const unsigned int multiplier = 23;
	unsigned int scaled = hash * multiplier;

	return scaled + (unsigned int)atom_byte;
}
39
40 static int
diff_data_atomize_text_lines_fd(struct diff_data * d)41 diff_data_atomize_text_lines_fd(struct diff_data *d)
42 {
43 off_t pos = 0;
44 const off_t end = pos + d->len;
45 unsigned int array_size_estimate = d->len / 50;
46 unsigned int pow2 = 1;
47 bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
48 bool embedded_nul = false;
49
50 while (array_size_estimate >>= 1)
51 pow2++;
52
53 ARRAYLIST_INIT(d->atoms, 1 << pow2);
54
55 if (fseek(d->root->f, 0L, SEEK_SET) == -1)
56 return errno;
57
58 while (pos < end) {
59 off_t line_end = pos;
60 unsigned int hash = 0;
61 unsigned char buf[512];
62 size_t r, i;
63 struct diff_atom *atom;
64 int eol = 0;
65
66 while (eol == 0 && line_end < end) {
67 r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
68 if (r == 0 && ferror(d->root->f))
69 return EIO;
70 i = 0;
71 while (eol == 0 && i < r) {
72 if (buf[i] != '\r' && buf[i] != '\n') {
73 if (!ignore_whitespace
74 || !isspace((unsigned char)buf[i]))
75 hash = diff_atom_hash_update(
76 hash, buf[i]);
77 if (buf[i] == '\0')
78 embedded_nul = true;
79 line_end++;
80 } else
81 eol = buf[i];
82 i++;
83 }
84 }
85
86 /* When not at the end of data, the line ending char ('\r' or
87 * '\n') must follow */
88 if (line_end < end)
89 line_end++;
90 /* If that was an '\r', also pull in any following '\n' */
91 if (line_end < end && eol == '\r') {
92 if (fseeko(d->root->f, line_end, SEEK_SET) == -1)
93 return errno;
94 r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
95 if (r == 0 && ferror(d->root->f))
96 return EIO;
97 if (r > 0 && buf[0] == '\n')
98 line_end++;
99 }
100
101 /* Record the found line as diff atom */
102 ARRAYLIST_ADD(atom, d->atoms);
103 if (!atom)
104 return ENOMEM;
105
106 *atom = (struct diff_atom){
107 .root = d,
108 .pos = pos,
109 .at = NULL, /* atom data is not memory-mapped */
110 .len = line_end - pos,
111 .hash = hash,
112 };
113
114 /* Starting point for next line: */
115 pos = line_end;
116 if (fseeko(d->root->f, pos, SEEK_SET) == -1)
117 return errno;
118 }
119
120 /* File are considered binary if they contain embedded '\0' bytes. */
121 if (embedded_nul)
122 d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
123
124 return DIFF_RC_OK;
125 }
126
/*
 * Jump target used to recover from a signal raised while scanning the
 * input data; it is armed with sigsetjmp() by
 * diff_data_atomize_text_lines_mmap() before the scan starts.
 */
static sigjmp_buf diff_data_signal_env;

/*
 * Signal handler: abandon the interrupted code and resume at the point
 * saved in diff_data_signal_env, making sigsetjmp() return the signal
 * number.
 */
static void
diff_data_signal_handler(int sig)
{
	siglongjmp(diff_data_signal_env, sig);
}
133
134 static int
diff_data_atomize_text_lines_mmap(struct diff_data * d)135 diff_data_atomize_text_lines_mmap(struct diff_data *d)
136 {
137 struct sigaction act, oact;
138 const uint8_t *volatile pos = d->data;
139 const uint8_t *end = pos + d->len;
140 bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
141 bool embedded_nul = false;
142 unsigned int array_size_estimate = d->len / 50;
143 unsigned int pow2 = 1;
144 int ret = DIFF_RC_OK;
145 while (array_size_estimate >>= 1)
146 pow2++;
147
148 ARRAYLIST_INIT(d->atoms, 1 << pow2);
149
150 sigemptyset(&act.sa_mask);
151 act.sa_flags = 0;
152 act.sa_handler = diff_data_signal_handler;
153 sigaction(SIGBUS, &act, &oact);
154 if (sigsetjmp(diff_data_signal_env, 0) > 0) {
155 /*
156 * The file was truncated while we were reading it, or an
157 * I/O error occurred. Set the end pointer to the
158 * beginning of the line we were trying to read, adjust
159 * the file length, and set the return value to an error.
160 */
161 end = pos;
162 d->len = end - d->data;
163 ret = EIO;
164 }
165 while (pos < end) {
166 const uint8_t *line_start = pos, *line_end = pos;
167 unsigned int hash = 0;
168
169 while (line_end < end && *line_end != '\r' && *line_end != '\n') {
170 if (!ignore_whitespace
171 || !isspace((unsigned char)*line_end))
172 hash = diff_atom_hash_update(hash, *line_end);
173 if (*line_end == '\0')
174 embedded_nul = true;
175 line_end++;
176 }
177
178 /* When not at the end of data, the line ending char ('\r' or
179 * '\n') must follow */
180 if (line_end < end && *line_end == '\r')
181 line_end++;
182 if (line_end < end && *line_end == '\n')
183 line_end++;
184
185 /* Record the found line as diff atom */
186 struct diff_atom *atom;
187 ARRAYLIST_ADD(atom, d->atoms);
188 if (!atom)
189 return ENOMEM;
190
191 *atom = (struct diff_atom){
192 .root = d,
193 .pos = (off_t)(line_start - d->data),
194 .at = line_start,
195 .len = line_end - line_start,
196 .hash = hash,
197 };
198
199 /* Starting point for next line: */
200 pos = line_end;
201 }
202 sigaction(SIGBUS, &oact, NULL);
203
204 /* File are considered binary if they contain embedded '\0' bytes. */
205 if (embedded_nul)
206 d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
207
208 return ret;
209 }
210
211 static int
diff_data_atomize_text_lines(struct diff_data * d)212 diff_data_atomize_text_lines(struct diff_data *d)
213 {
214 if (d->data == NULL)
215 return diff_data_atomize_text_lines_fd(d);
216 else
217 return diff_data_atomize_text_lines_mmap(d);
218 }
219
/*
 * Public entry point: split the data of d into one atom per text line.
 *
 * func_data: not used by this atomizer.
 * d:         the diff data to atomize.
 * Returns the result of diff_data_atomize_text_lines().
 */
int
diff_atomize_text_by_line(void *func_data, struct diff_data *d)
{
	int rc;

	(void)func_data;
	rc = diff_data_atomize_text_lines(d);
	return rc;
}
225