xref: /freebsd/contrib/libdiff/lib/diff_atomize_text.c (revision a689bfa4e25af8307709dc12f75b0e02a65abf18)
1 /* Split source by line breaks, and calculate a simplistic checksum. */
2 /*
3  * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include <errno.h>
19 #include <setjmp.h>
20 #include <signal.h>
21 #include <stdbool.h>
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <unistd.h>
26 #include <ctype.h>
27 
28 #include <arraylist.h>
29 #include <diff_main.h>
30 
31 #include "diff_internal.h"
32 #include "diff_debug.h"
33 
/*
 * Fold one more byte into a running line checksum.
 *
 * hash:      checksum accumulated so far (0 for an empty line).
 * atom_byte: next byte of the line.
 *
 * Returns the updated checksum: the previous value multiplied by 23 plus
 * the byte, using unsigned (wrapping) arithmetic.
 */
unsigned int
diff_atom_hash_update(unsigned int hash, unsigned char atom_byte)
{
	unsigned int mixed = hash;

	mixed *= 23;
	mixed += atom_byte;
	return mixed;
}
39 
40 static int
diff_data_atomize_text_lines_fd(struct diff_data * d)41 diff_data_atomize_text_lines_fd(struct diff_data *d)
42 {
43 	off_t pos = 0;
44 	const off_t end = pos + d->len;
45 	unsigned int array_size_estimate = d->len / 50;
46 	unsigned int pow2 = 1;
47 	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
48 	bool embedded_nul = false;
49 
50 	while (array_size_estimate >>= 1)
51 		pow2++;
52 
53 	ARRAYLIST_INIT(d->atoms, 1 << pow2);
54 
55 	if (fseek(d->root->f, 0L, SEEK_SET) == -1)
56 		return errno;
57 
58 	while (pos < end) {
59 		off_t line_end = pos;
60 		unsigned int hash = 0;
61 		unsigned char buf[512];
62 		size_t r, i;
63 		struct diff_atom *atom;
64 		int eol = 0;
65 
66 		while (eol == 0 && line_end < end) {
67 			r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
68 			if (r == 0 && ferror(d->root->f))
69 				return EIO;
70 			i = 0;
71 			while (eol == 0 && i < r) {
72 				if (buf[i] != '\r' && buf[i] != '\n') {
73 					if (!ignore_whitespace
74 					    || !isspace((unsigned char)buf[i]))
75 						hash = diff_atom_hash_update(
76 						    hash, buf[i]);
77 					if (buf[i] == '\0')
78 						embedded_nul = true;
79 					line_end++;
80 				} else
81 					eol = buf[i];
82 				i++;
83 			}
84 		}
85 
86 		/* When not at the end of data, the line ending char ('\r' or
87 		 * '\n') must follow */
88 		if (line_end < end)
89 			line_end++;
90 		/* If that was an '\r', also pull in any following '\n' */
91 		if (line_end < end && eol == '\r') {
92 			if (fseeko(d->root->f, line_end, SEEK_SET) == -1)
93 				return errno;
94 			r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
95 			if (r == 0 && ferror(d->root->f))
96 				return EIO;
97 			if (r > 0 && buf[0] == '\n')
98 				line_end++;
99 		}
100 
101 		/* Record the found line as diff atom */
102 		ARRAYLIST_ADD(atom, d->atoms);
103 		if (!atom)
104 			return ENOMEM;
105 
106 		*atom = (struct diff_atom){
107 			.root = d,
108 			.pos = pos,
109 			.at = NULL,	/* atom data is not memory-mapped */
110 			.len = line_end - pos,
111 			.hash = hash,
112 		};
113 
114 		/* Starting point for next line: */
115 		pos = line_end;
116 		if (fseeko(d->root->f, pos, SEEK_SET) == -1)
117 			return errno;
118 	}
119 
120 	/* File are considered binary if they contain embedded '\0' bytes. */
121 	if (embedded_nul)
122 		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
123 
124 	return DIFF_RC_OK;
125 }
126 
127 static sigjmp_buf diff_data_signal_env;
128 static void
diff_data_signal_handler(int sig)129 diff_data_signal_handler(int sig)
130 {
131 	siglongjmp(diff_data_signal_env, sig);
132 }
133 
134 static int
diff_data_atomize_text_lines_mmap(struct diff_data * d)135 diff_data_atomize_text_lines_mmap(struct diff_data *d)
136 {
137 	struct sigaction act, oact;
138 	const uint8_t *volatile pos = d->data;
139 	const uint8_t *end = pos + d->len;
140 	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
141 	bool embedded_nul = false;
142 	unsigned int array_size_estimate = d->len / 50;
143 	unsigned int pow2 = 1;
144 	int ret = DIFF_RC_OK;
145 	while (array_size_estimate >>= 1)
146 		pow2++;
147 
148 	ARRAYLIST_INIT(d->atoms, 1 << pow2);
149 
150 	sigemptyset(&act.sa_mask);
151 	act.sa_flags = 0;
152 	act.sa_handler = diff_data_signal_handler;
153 	sigaction(SIGBUS, &act, &oact);
154 	if (sigsetjmp(diff_data_signal_env, 0) > 0) {
155 		/*
156 		 * The file was truncated while we were reading it, or an
157 		 * I/O error occurred.  Set the end pointer to the
158 		 * beginning of the line we were trying to read, adjust
159 		 * the file length, and set the return value to an error.
160 		 */
161 		end = pos;
162 		d->len = end - d->data;
163 		ret = EIO;
164 	}
165 	while (pos < end) {
166 		const uint8_t *line_start = pos, *line_end = pos;
167 		unsigned int hash = 0;
168 
169 		while (line_end < end && *line_end != '\r' && *line_end != '\n') {
170 			if (!ignore_whitespace
171 			    || !isspace((unsigned char)*line_end))
172 				hash = diff_atom_hash_update(hash, *line_end);
173 			if (*line_end == '\0')
174 				embedded_nul = true;
175 			line_end++;
176 		}
177 
178 		/* When not at the end of data, the line ending char ('\r' or
179 		 * '\n') must follow */
180 		if (line_end < end && *line_end == '\r')
181 			line_end++;
182 		if (line_end < end && *line_end == '\n')
183 			line_end++;
184 
185 		/* Record the found line as diff atom */
186 		struct diff_atom *atom;
187 		ARRAYLIST_ADD(atom, d->atoms);
188 		if (!atom)
189 			return ENOMEM;
190 
191 		*atom = (struct diff_atom){
192 			.root = d,
193 			.pos = (off_t)(line_start - d->data),
194 			.at = line_start,
195 			.len = line_end - line_start,
196 			.hash = hash,
197 		};
198 
199 		/* Starting point for next line: */
200 		pos = line_end;
201 	}
202 	sigaction(SIGBUS, &oact, NULL);
203 
204 	/* File are considered binary if they contain embedded '\0' bytes. */
205 	if (embedded_nul)
206 		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
207 
208 	return ret;
209 }
210 
211 static int
diff_data_atomize_text_lines(struct diff_data * d)212 diff_data_atomize_text_lines(struct diff_data *d)
213 {
214 	if (d->data == NULL)
215 		return diff_data_atomize_text_lines_fd(d);
216 	else
217 		return diff_data_atomize_text_lines_mmap(d);
218 }
219 
/*
 * Public entry point: split d into one atom per text line.
 *
 * func_data is accepted but not used by this function.
 * Returns the result of diff_data_atomize_text_lines().
 */
int
diff_atomize_text_by_line(void *func_data, struct diff_data *d)
{
	int rc;

	(void)func_data;
	rc = diff_data_atomize_text_lines(d);
	return rc;
}
225