xref: /freebsd/contrib/libdiff/lib/diff_atomize_text.c (revision 357378bbdedf24ce2b90e9bd831af4a9db3ec70a)
1 /* Split source by line breaks, and calculate a simplistic checksum. */
2 /*
3  * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include <errno.h>
19 #include <setjmp.h>
20 #include <signal.h>
21 #include <stdbool.h>
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <unistd.h>
26 #include <ctype.h>
27 
28 #include <arraylist.h>
29 #include <diff_main.h>
30 
31 #include "diff_internal.h"
32 #include "diff_debug.h"
33 
/*
 * Fold one more byte into a running line checksum.
 * The new value is the previous hash multiplied by 23 plus the byte;
 * the arithmetic is unsigned and therefore wraps on overflow.
 */
unsigned int
diff_atom_hash_update(unsigned int hash, unsigned char atom_byte)
{
	const unsigned int multiplier = 23;
	unsigned int updated = hash * multiplier;

	updated += atom_byte;
	return updated;
}
39 
40 static int
41 diff_data_atomize_text_lines_fd(struct diff_data *d)
42 {
43 	off_t pos = 0;
44 	const off_t end = pos + d->len;
45 	unsigned int array_size_estimate = d->len / 50;
46 	unsigned int pow2 = 1;
47 	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
48 	bool embedded_nul = false;
49 
50 	while (array_size_estimate >>= 1)
51 		pow2++;
52 
53 	ARRAYLIST_INIT(d->atoms, 1 << pow2);
54 
55 	if (fseek(d->root->f, 0L, SEEK_SET) == -1)
56 		return errno;
57 
58 	while (pos < end) {
59 		off_t line_end = pos;
60 		unsigned int hash = 0;
61 		unsigned char buf[512];
62 		size_t r, i;
63 		struct diff_atom *atom;
64 		int eol = 0;
65 
66 		while (eol == 0 && line_end < end) {
67 			r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
68 			if (r == 0 && ferror(d->root->f))
69 				return EIO;
70 			i = 0;
71 			while (eol == 0 && i < r) {
72 				if (buf[i] != '\r' && buf[i] != '\n') {
73 					if (!ignore_whitespace
74 					    || !isspace((unsigned char)buf[i]))
75 						hash = diff_atom_hash_update(
76 						    hash, buf[i]);
77 					if (buf[i] == '\0')
78 						embedded_nul = true;
79 					line_end++;
80 				} else
81 					eol = buf[i];
82 				i++;
83 			}
84 		}
85 
86 		/* When not at the end of data, the line ending char ('\r' or
87 		 * '\n') must follow */
88 		if (line_end < end)
89 			line_end++;
90 		/* If that was an '\r', also pull in any following '\n' */
91 		if (line_end < end && eol == '\r') {
92 			if (fseeko(d->root->f, line_end, SEEK_SET) == -1)
93 				return errno;
94 			r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
95 			if (r == 0 && ferror(d->root->f))
96 				return EIO;
97 			if (r > 0 && buf[0] == '\n')
98 				line_end++;
99 		}
100 
101 		/* Record the found line as diff atom */
102 		ARRAYLIST_ADD(atom, d->atoms);
103 		if (!atom)
104 			return ENOMEM;
105 
106 		*atom = (struct diff_atom){
107 			.root = d,
108 			.pos = pos,
109 			.at = NULL,	/* atom data is not memory-mapped */
110 			.len = line_end - pos,
111 			.hash = hash,
112 		};
113 
114 		/* Starting point for next line: */
115 		pos = line_end;
116 		if (fseeko(d->root->f, pos, SEEK_SET) == -1)
117 			return errno;
118 	}
119 
120 	/* File are considered binary if they contain embedded '\0' bytes. */
121 	if (embedded_nul)
122 		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
123 
124 	return DIFF_RC_OK;
125 }
126 
/* Jump buffer that diff_data_signal_handler() unwinds to. */
static sigjmp_buf diff_data_signal_env;

/*
 * Signal handler: jump back to the sigsetjmp() point stored in
 * diff_data_signal_env, handing over the signal number as the value
 * that sigsetjmp() appears to return.
 */
static void
diff_data_signal_handler(int signo)
{
	siglongjmp(diff_data_signal_env, signo);
}
133 
134 static int
135 diff_data_atomize_text_lines_mmap(struct diff_data *d)
136 {
137 	struct sigaction act, oact;
138 	const uint8_t *volatile pos = d->data;
139 	const uint8_t *end = pos + d->len;
140 	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
141 	bool embedded_nul = false;
142 	unsigned int array_size_estimate = d->len / 50;
143 	unsigned int pow2 = 1;
144 	while (array_size_estimate >>= 1)
145 		pow2++;
146 
147 	ARRAYLIST_INIT(d->atoms, 1 << pow2);
148 
149 	sigemptyset(&act.sa_mask);
150 	act.sa_flags = 0;
151 	act.sa_handler = diff_data_signal_handler;
152 	sigaction(SIGBUS, &act, &oact);
153 	if (sigsetjmp(diff_data_signal_env, 0) > 0) {
154 		/*
155 		 * The file was truncated while we were reading it.  Set
156 		 * the end pointer to the beginning of the line we were
157 		 * trying to read, adjust the file length, and set a flag.
158 		 */
159 		end = pos;
160 		d->len = end - d->data;
161 		d->atomizer_flags |= DIFF_ATOMIZER_FILE_TRUNCATED;
162 	}
163 	while (pos < end) {
164 		const uint8_t *line_start = pos, *line_end = pos;
165 		unsigned int hash = 0;
166 
167 		while (line_end < end && *line_end != '\r' && *line_end != '\n') {
168 			if (!ignore_whitespace
169 			    || !isspace((unsigned char)*line_end))
170 				hash = diff_atom_hash_update(hash, *line_end);
171 			if (*line_end == '\0')
172 				embedded_nul = true;
173 			line_end++;
174 		}
175 
176 		/* When not at the end of data, the line ending char ('\r' or
177 		 * '\n') must follow */
178 		if (line_end < end && *line_end == '\r')
179 			line_end++;
180 		if (line_end < end && *line_end == '\n')
181 			line_end++;
182 
183 		/* Record the found line as diff atom */
184 		struct diff_atom *atom;
185 		ARRAYLIST_ADD(atom, d->atoms);
186 		if (!atom)
187 			return ENOMEM;
188 
189 		*atom = (struct diff_atom){
190 			.root = d,
191 			.pos = (off_t)(line_start - d->data),
192 			.at = line_start,
193 			.len = line_end - line_start,
194 			.hash = hash,
195 		};
196 
197 		/* Starting point for next line: */
198 		pos = line_end;
199 	}
200 	sigaction(SIGBUS, &oact, NULL);
201 
202 	/* File are considered binary if they contain embedded '\0' bytes. */
203 	if (embedded_nul)
204 		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
205 
206 	return DIFF_RC_OK;
207 }
208 
209 static int
210 diff_data_atomize_text_lines(struct diff_data *d)
211 {
212 	if (d->data == NULL)
213 		return diff_data_atomize_text_lines_fd(d);
214 	else
215 		return diff_data_atomize_text_lines_mmap(d);
216 }
217 
/*
 * Public entry point: atomize d into lines.
 * func_data is accepted but not used.  Returns the result of
 * diff_data_atomize_text_lines().
 */
int
diff_atomize_text_by_line(void *func_data, struct diff_data *d)
{
	int rc;

	(void)func_data;	/* unused */
	rc = diff_data_atomize_text_lines(d);
	return rc;
}
223