1 /* Split source by line breaks, and calculate a simplistic checksum. */
2 /*
3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18 #include <errno.h>
19 #include <setjmp.h>
20 #include <signal.h>
21 #include <stdbool.h>
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <unistd.h>
26 #include <ctype.h>
27
28 #include <arraylist.h>
29 #include <diff_main.h>
30
31 #include "diff_internal.h"
32 #include "diff_debug.h"
33
/*
 * Fold one more byte into a running line checksum.
 *
 * The previous hash value is scaled by a small constant and the byte is
 * added on top; unsigned arithmetic wraps around on overflow.
 */
unsigned int
diff_atom_hash_update(unsigned int hash, unsigned char atom_byte)
{
	const unsigned int multiplier = 23;
	unsigned int scaled = hash * multiplier;

	return scaled + atom_byte;
}
39
40 static int
diff_data_atomize_text_lines_fd(struct diff_data * d)41 diff_data_atomize_text_lines_fd(struct diff_data *d)
42 {
43 off_t pos = 0;
44 const off_t end = pos + d->len;
45 unsigned int array_size_estimate = d->len / 50;
46 unsigned int pow2 = 1;
47 bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
48 bool embedded_nul = false;
49
50 while (array_size_estimate >>= 1)
51 pow2++;
52
53 ARRAYLIST_INIT(d->atoms, 1 << pow2);
54
55 if (fseek(d->root->f, 0L, SEEK_SET) == -1)
56 return errno;
57
58 while (pos < end) {
59 off_t line_end = pos;
60 unsigned int hash = 0;
61 unsigned char buf[512];
62 size_t r, i;
63 struct diff_atom *atom;
64 int eol = 0;
65
66 while (eol == 0 && line_end < end) {
67 r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
68 if (r == 0 && ferror(d->root->f))
69 return EIO;
70 i = 0;
71 while (eol == 0 && i < r) {
72 if (buf[i] != '\r' && buf[i] != '\n') {
73 if (!ignore_whitespace
74 || !isspace((unsigned char)buf[i]))
75 hash = diff_atom_hash_update(
76 hash, buf[i]);
77 if (buf[i] == '\0')
78 embedded_nul = true;
79 line_end++;
80 } else
81 eol = buf[i];
82 i++;
83 }
84 }
85
86 /* When not at the end of data, the line ending char ('\r' or
87 * '\n') must follow */
88 if (line_end < end)
89 line_end++;
90 /* If that was an '\r', also pull in any following '\n' */
91 if (line_end < end && eol == '\r') {
92 if (fseeko(d->root->f, line_end, SEEK_SET) == -1)
93 return errno;
94 r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
95 if (r == 0 && ferror(d->root->f))
96 return EIO;
97 if (r > 0 && buf[0] == '\n')
98 line_end++;
99 }
100
101 /* Record the found line as diff atom */
102 ARRAYLIST_ADD(atom, d->atoms);
103 if (!atom)
104 return ENOMEM;
105
106 *atom = (struct diff_atom){
107 .root = d,
108 .pos = pos,
109 .at = NULL, /* atom data is not memory-mapped */
110 .len = line_end - pos,
111 .hash = hash,
112 };
113
114 /* Starting point for next line: */
115 pos = line_end;
116 if (fseeko(d->root->f, pos, SEEK_SET) == -1)
117 return errno;
118 }
119
120 /* File are considered binary if they contain embedded '\0' bytes. */
121 if (embedded_nul)
122 d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
123
124 return DIFF_RC_OK;
125 }
126
/* Jump target used to get out of the signal handler below. */
static sigjmp_buf diff_data_signal_env;

/*
 * Signal handler: jump back to wherever diff_data_signal_env was set up
 * with sigsetjmp(), handing over the signal number as the return value.
 */
static void
diff_data_signal_handler(int signo)
{
	siglongjmp(diff_data_signal_env, signo);
}
133
134 static int
diff_data_atomize_text_lines_mmap(struct diff_data * d)135 diff_data_atomize_text_lines_mmap(struct diff_data *d)
136 {
137 struct sigaction act, oact;
138 const uint8_t *volatile pos = d->data;
139 const uint8_t *end = pos + d->len;
140 bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
141 bool embedded_nul = false;
142 unsigned int array_size_estimate = d->len / 50;
143 unsigned int pow2 = 1;
144 while (array_size_estimate >>= 1)
145 pow2++;
146
147 ARRAYLIST_INIT(d->atoms, 1 << pow2);
148
149 sigemptyset(&act.sa_mask);
150 act.sa_flags = 0;
151 act.sa_handler = diff_data_signal_handler;
152 sigaction(SIGBUS, &act, &oact);
153 if (sigsetjmp(diff_data_signal_env, 0) > 0) {
154 /*
155 * The file was truncated while we were reading it. Set
156 * the end pointer to the beginning of the line we were
157 * trying to read, adjust the file length, and set a flag.
158 */
159 end = pos;
160 d->len = end - d->data;
161 d->atomizer_flags |= DIFF_ATOMIZER_FILE_TRUNCATED;
162 }
163 while (pos < end) {
164 const uint8_t *line_start = pos, *line_end = pos;
165 unsigned int hash = 0;
166
167 while (line_end < end && *line_end != '\r' && *line_end != '\n') {
168 if (!ignore_whitespace
169 || !isspace((unsigned char)*line_end))
170 hash = diff_atom_hash_update(hash, *line_end);
171 if (*line_end == '\0')
172 embedded_nul = true;
173 line_end++;
174 }
175
176 /* When not at the end of data, the line ending char ('\r' or
177 * '\n') must follow */
178 if (line_end < end && *line_end == '\r')
179 line_end++;
180 if (line_end < end && *line_end == '\n')
181 line_end++;
182
183 /* Record the found line as diff atom */
184 struct diff_atom *atom;
185 ARRAYLIST_ADD(atom, d->atoms);
186 if (!atom)
187 return ENOMEM;
188
189 *atom = (struct diff_atom){
190 .root = d,
191 .pos = (off_t)(line_start - d->data),
192 .at = line_start,
193 .len = line_end - line_start,
194 .hash = hash,
195 };
196
197 /* Starting point for next line: */
198 pos = line_end;
199 }
200 sigaction(SIGBUS, &oact, NULL);
201
202 /* File are considered binary if they contain embedded '\0' bytes. */
203 if (embedded_nul)
204 d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
205
206 return DIFF_RC_OK;
207 }
208
209 static int
diff_data_atomize_text_lines(struct diff_data * d)210 diff_data_atomize_text_lines(struct diff_data *d)
211 {
212 if (d->data == NULL)
213 return diff_data_atomize_text_lines_fd(d);
214 else
215 return diff_data_atomize_text_lines_mmap(d);
216 }
217
/*
 * Public entry point: atomize the data in 'd' line by line.
 * 'func_data' is not used. Returns the result of the line atomizer.
 */
int
diff_atomize_text_by_line(void *func_data, struct diff_data *d)
{
	int rc;

	(void)func_data;	/* not needed by this atomizer */

	rc = diff_data_atomize_text_lines(d);
	return rc;
}
223