xref: /freebsd/contrib/libdiff/lib/diff_atomize_text.c (revision 974ea6b297f8f9821bbb60670e2b90ba9989b283)
159c8e88eSDag-Erling Smørgrav /* Split source by line breaks, and calculate a simplistic checksum. */
259c8e88eSDag-Erling Smørgrav /*
359c8e88eSDag-Erling Smørgrav  * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
459c8e88eSDag-Erling Smørgrav  *
559c8e88eSDag-Erling Smørgrav  * Permission to use, copy, modify, and distribute this software for any
659c8e88eSDag-Erling Smørgrav  * purpose with or without fee is hereby granted, provided that the above
759c8e88eSDag-Erling Smørgrav  * copyright notice and this permission notice appear in all copies.
859c8e88eSDag-Erling Smørgrav  *
959c8e88eSDag-Erling Smørgrav  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1059c8e88eSDag-Erling Smørgrav  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1159c8e88eSDag-Erling Smørgrav  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1259c8e88eSDag-Erling Smørgrav  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1359c8e88eSDag-Erling Smørgrav  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1459c8e88eSDag-Erling Smørgrav  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1559c8e88eSDag-Erling Smørgrav  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1659c8e88eSDag-Erling Smørgrav  */
1759c8e88eSDag-Erling Smørgrav 
1859c8e88eSDag-Erling Smørgrav #include <errno.h>
19*974ea6b2SDag-Erling Smørgrav #include <setjmp.h>
20*974ea6b2SDag-Erling Smørgrav #include <signal.h>
2159c8e88eSDag-Erling Smørgrav #include <stdbool.h>
2259c8e88eSDag-Erling Smørgrav #include <stdint.h>
2359c8e88eSDag-Erling Smørgrav #include <stdio.h>
2459c8e88eSDag-Erling Smørgrav #include <stdlib.h>
2559c8e88eSDag-Erling Smørgrav #include <unistd.h>
2659c8e88eSDag-Erling Smørgrav #include <ctype.h>
2759c8e88eSDag-Erling Smørgrav 
2859c8e88eSDag-Erling Smørgrav #include <arraylist.h>
2959c8e88eSDag-Erling Smørgrav #include <diff_main.h>
3059c8e88eSDag-Erling Smørgrav 
3159c8e88eSDag-Erling Smørgrav #include "diff_internal.h"
3259c8e88eSDag-Erling Smørgrav #include "diff_debug.h"
3359c8e88eSDag-Erling Smørgrav 
/*
 * Fold one more byte into a running atom hash.
 *
 * The previous hash value is multiplied by 23 and the byte is added; the
 * arithmetic is unsigned, so it wraps around instead of overflowing.
 * Returns the updated hash.
 */
unsigned int
diff_atom_hash_update(unsigned int hash, unsigned char atom_byte)
{
	unsigned int mixed = hash * 23u;

	mixed += (unsigned int)atom_byte;
	return mixed;
}
3959c8e88eSDag-Erling Smørgrav 
4059c8e88eSDag-Erling Smørgrav static int
diff_data_atomize_text_lines_fd(struct diff_data * d)4159c8e88eSDag-Erling Smørgrav diff_data_atomize_text_lines_fd(struct diff_data *d)
4259c8e88eSDag-Erling Smørgrav {
4359c8e88eSDag-Erling Smørgrav 	off_t pos = 0;
4459c8e88eSDag-Erling Smørgrav 	const off_t end = pos + d->len;
4559c8e88eSDag-Erling Smørgrav 	unsigned int array_size_estimate = d->len / 50;
4659c8e88eSDag-Erling Smørgrav 	unsigned int pow2 = 1;
4759c8e88eSDag-Erling Smørgrav 	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
4859c8e88eSDag-Erling Smørgrav 	bool embedded_nul = false;
4959c8e88eSDag-Erling Smørgrav 
5059c8e88eSDag-Erling Smørgrav 	while (array_size_estimate >>= 1)
5159c8e88eSDag-Erling Smørgrav 		pow2++;
5259c8e88eSDag-Erling Smørgrav 
5359c8e88eSDag-Erling Smørgrav 	ARRAYLIST_INIT(d->atoms, 1 << pow2);
5459c8e88eSDag-Erling Smørgrav 
5559c8e88eSDag-Erling Smørgrav 	if (fseek(d->root->f, 0L, SEEK_SET) == -1)
5659c8e88eSDag-Erling Smørgrav 		return errno;
5759c8e88eSDag-Erling Smørgrav 
5859c8e88eSDag-Erling Smørgrav 	while (pos < end) {
5959c8e88eSDag-Erling Smørgrav 		off_t line_end = pos;
6059c8e88eSDag-Erling Smørgrav 		unsigned int hash = 0;
6159c8e88eSDag-Erling Smørgrav 		unsigned char buf[512];
6259c8e88eSDag-Erling Smørgrav 		size_t r, i;
6359c8e88eSDag-Erling Smørgrav 		struct diff_atom *atom;
6459c8e88eSDag-Erling Smørgrav 		int eol = 0;
6559c8e88eSDag-Erling Smørgrav 
6659c8e88eSDag-Erling Smørgrav 		while (eol == 0 && line_end < end) {
6759c8e88eSDag-Erling Smørgrav 			r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
6859c8e88eSDag-Erling Smørgrav 			if (r == 0 && ferror(d->root->f))
6959c8e88eSDag-Erling Smørgrav 				return EIO;
7059c8e88eSDag-Erling Smørgrav 			i = 0;
7159c8e88eSDag-Erling Smørgrav 			while (eol == 0 && i < r) {
7259c8e88eSDag-Erling Smørgrav 				if (buf[i] != '\r' && buf[i] != '\n') {
7359c8e88eSDag-Erling Smørgrav 					if (!ignore_whitespace
7459c8e88eSDag-Erling Smørgrav 					    || !isspace((unsigned char)buf[i]))
7559c8e88eSDag-Erling Smørgrav 						hash = diff_atom_hash_update(
7659c8e88eSDag-Erling Smørgrav 						    hash, buf[i]);
7759c8e88eSDag-Erling Smørgrav 					if (buf[i] == '\0')
7859c8e88eSDag-Erling Smørgrav 						embedded_nul = true;
7959c8e88eSDag-Erling Smørgrav 					line_end++;
8059c8e88eSDag-Erling Smørgrav 				} else
8159c8e88eSDag-Erling Smørgrav 					eol = buf[i];
8259c8e88eSDag-Erling Smørgrav 				i++;
8359c8e88eSDag-Erling Smørgrav 			}
8459c8e88eSDag-Erling Smørgrav 		}
8559c8e88eSDag-Erling Smørgrav 
8659c8e88eSDag-Erling Smørgrav 		/* When not at the end of data, the line ending char ('\r' or
8759c8e88eSDag-Erling Smørgrav 		 * '\n') must follow */
8859c8e88eSDag-Erling Smørgrav 		if (line_end < end)
8959c8e88eSDag-Erling Smørgrav 			line_end++;
9059c8e88eSDag-Erling Smørgrav 		/* If that was an '\r', also pull in any following '\n' */
9159c8e88eSDag-Erling Smørgrav 		if (line_end < end && eol == '\r') {
9259c8e88eSDag-Erling Smørgrav 			if (fseeko(d->root->f, line_end, SEEK_SET) == -1)
9359c8e88eSDag-Erling Smørgrav 				return errno;
9459c8e88eSDag-Erling Smørgrav 			r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
9559c8e88eSDag-Erling Smørgrav 			if (r == 0 && ferror(d->root->f))
9659c8e88eSDag-Erling Smørgrav 				return EIO;
9759c8e88eSDag-Erling Smørgrav 			if (r > 0 && buf[0] == '\n')
9859c8e88eSDag-Erling Smørgrav 				line_end++;
9959c8e88eSDag-Erling Smørgrav 		}
10059c8e88eSDag-Erling Smørgrav 
10159c8e88eSDag-Erling Smørgrav 		/* Record the found line as diff atom */
10259c8e88eSDag-Erling Smørgrav 		ARRAYLIST_ADD(atom, d->atoms);
10359c8e88eSDag-Erling Smørgrav 		if (!atom)
10459c8e88eSDag-Erling Smørgrav 			return ENOMEM;
10559c8e88eSDag-Erling Smørgrav 
10659c8e88eSDag-Erling Smørgrav 		*atom = (struct diff_atom){
10759c8e88eSDag-Erling Smørgrav 			.root = d,
10859c8e88eSDag-Erling Smørgrav 			.pos = pos,
10959c8e88eSDag-Erling Smørgrav 			.at = NULL,	/* atom data is not memory-mapped */
11059c8e88eSDag-Erling Smørgrav 			.len = line_end - pos,
11159c8e88eSDag-Erling Smørgrav 			.hash = hash,
11259c8e88eSDag-Erling Smørgrav 		};
11359c8e88eSDag-Erling Smørgrav 
11459c8e88eSDag-Erling Smørgrav 		/* Starting point for next line: */
11559c8e88eSDag-Erling Smørgrav 		pos = line_end;
11659c8e88eSDag-Erling Smørgrav 		if (fseeko(d->root->f, pos, SEEK_SET) == -1)
11759c8e88eSDag-Erling Smørgrav 			return errno;
11859c8e88eSDag-Erling Smørgrav 	}
11959c8e88eSDag-Erling Smørgrav 
12059c8e88eSDag-Erling Smørgrav 	/* File are considered binary if they contain embedded '\0' bytes. */
12159c8e88eSDag-Erling Smørgrav 	if (embedded_nul)
12259c8e88eSDag-Erling Smørgrav 		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
12359c8e88eSDag-Erling Smørgrav 
12459c8e88eSDag-Erling Smørgrav 	return DIFF_RC_OK;
12559c8e88eSDag-Erling Smørgrav }
12659c8e88eSDag-Erling Smørgrav 
/*
 * Jump buffer filled in by diff_data_atomize_text_lines_mmap() right before
 * it starts reading mapped data.  It is a single file-scope object.
 */
static sigjmp_buf diff_data_signal_env;

/*
 * Signal handler: jump back to the sigsetjmp() call that filled in
 * diff_data_signal_env, handing it the signal number as its return value.
 * This function does not return to the interrupted code.
 */
static void
diff_data_signal_handler(int signo)
{
	siglongjmp(diff_data_signal_env, signo);
}
133*974ea6b2SDag-Erling Smørgrav 
13459c8e88eSDag-Erling Smørgrav static int
diff_data_atomize_text_lines_mmap(struct diff_data * d)13559c8e88eSDag-Erling Smørgrav diff_data_atomize_text_lines_mmap(struct diff_data *d)
13659c8e88eSDag-Erling Smørgrav {
137*974ea6b2SDag-Erling Smørgrav 	struct sigaction act, oact;
138*974ea6b2SDag-Erling Smørgrav 	const uint8_t *volatile pos = d->data;
13959c8e88eSDag-Erling Smørgrav 	const uint8_t *end = pos + d->len;
14059c8e88eSDag-Erling Smørgrav 	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
14159c8e88eSDag-Erling Smørgrav 	bool embedded_nul = false;
14259c8e88eSDag-Erling Smørgrav 	unsigned int array_size_estimate = d->len / 50;
14359c8e88eSDag-Erling Smørgrav 	unsigned int pow2 = 1;
14459c8e88eSDag-Erling Smørgrav 	while (array_size_estimate >>= 1)
14559c8e88eSDag-Erling Smørgrav 		pow2++;
14659c8e88eSDag-Erling Smørgrav 
14759c8e88eSDag-Erling Smørgrav 	ARRAYLIST_INIT(d->atoms, 1 << pow2);
14859c8e88eSDag-Erling Smørgrav 
149*974ea6b2SDag-Erling Smørgrav 	sigemptyset(&act.sa_mask);
150*974ea6b2SDag-Erling Smørgrav 	act.sa_flags = 0;
151*974ea6b2SDag-Erling Smørgrav 	act.sa_handler = diff_data_signal_handler;
152*974ea6b2SDag-Erling Smørgrav 	sigaction(SIGBUS, &act, &oact);
153*974ea6b2SDag-Erling Smørgrav 	if (sigsetjmp(diff_data_signal_env, 0) > 0) {
154*974ea6b2SDag-Erling Smørgrav 		/*
155*974ea6b2SDag-Erling Smørgrav 		 * The file was truncated while we were reading it.  Set
156*974ea6b2SDag-Erling Smørgrav 		 * the end pointer to the beginning of the line we were
157*974ea6b2SDag-Erling Smørgrav 		 * trying to read, adjust the file length, and set a flag.
158*974ea6b2SDag-Erling Smørgrav 		 */
159*974ea6b2SDag-Erling Smørgrav 		end = pos;
160*974ea6b2SDag-Erling Smørgrav 		d->len = end - d->data;
161*974ea6b2SDag-Erling Smørgrav 		d->atomizer_flags |= DIFF_ATOMIZER_FILE_TRUNCATED;
162*974ea6b2SDag-Erling Smørgrav 	}
16359c8e88eSDag-Erling Smørgrav 	while (pos < end) {
164*974ea6b2SDag-Erling Smørgrav 		const uint8_t *line_start = pos, *line_end = pos;
16559c8e88eSDag-Erling Smørgrav 		unsigned int hash = 0;
16659c8e88eSDag-Erling Smørgrav 
16759c8e88eSDag-Erling Smørgrav 		while (line_end < end && *line_end != '\r' && *line_end != '\n') {
16859c8e88eSDag-Erling Smørgrav 			if (!ignore_whitespace
16959c8e88eSDag-Erling Smørgrav 			    || !isspace((unsigned char)*line_end))
17059c8e88eSDag-Erling Smørgrav 				hash = diff_atom_hash_update(hash, *line_end);
17159c8e88eSDag-Erling Smørgrav 			if (*line_end == '\0')
17259c8e88eSDag-Erling Smørgrav 				embedded_nul = true;
17359c8e88eSDag-Erling Smørgrav 			line_end++;
17459c8e88eSDag-Erling Smørgrav 		}
17559c8e88eSDag-Erling Smørgrav 
17659c8e88eSDag-Erling Smørgrav 		/* When not at the end of data, the line ending char ('\r' or
17759c8e88eSDag-Erling Smørgrav 		 * '\n') must follow */
17859c8e88eSDag-Erling Smørgrav 		if (line_end < end && *line_end == '\r')
17959c8e88eSDag-Erling Smørgrav 			line_end++;
18059c8e88eSDag-Erling Smørgrav 		if (line_end < end && *line_end == '\n')
18159c8e88eSDag-Erling Smørgrav 			line_end++;
18259c8e88eSDag-Erling Smørgrav 
18359c8e88eSDag-Erling Smørgrav 		/* Record the found line as diff atom */
18459c8e88eSDag-Erling Smørgrav 		struct diff_atom *atom;
18559c8e88eSDag-Erling Smørgrav 		ARRAYLIST_ADD(atom, d->atoms);
18659c8e88eSDag-Erling Smørgrav 		if (!atom)
18759c8e88eSDag-Erling Smørgrav 			return ENOMEM;
18859c8e88eSDag-Erling Smørgrav 
18959c8e88eSDag-Erling Smørgrav 		*atom = (struct diff_atom){
19059c8e88eSDag-Erling Smørgrav 			.root = d,
191*974ea6b2SDag-Erling Smørgrav 			.pos = (off_t)(line_start - d->data),
192*974ea6b2SDag-Erling Smørgrav 			.at = line_start,
193*974ea6b2SDag-Erling Smørgrav 			.len = line_end - line_start,
19459c8e88eSDag-Erling Smørgrav 			.hash = hash,
19559c8e88eSDag-Erling Smørgrav 		};
19659c8e88eSDag-Erling Smørgrav 
19759c8e88eSDag-Erling Smørgrav 		/* Starting point for next line: */
19859c8e88eSDag-Erling Smørgrav 		pos = line_end;
19959c8e88eSDag-Erling Smørgrav 	}
200*974ea6b2SDag-Erling Smørgrav 	sigaction(SIGBUS, &oact, NULL);
20159c8e88eSDag-Erling Smørgrav 
20259c8e88eSDag-Erling Smørgrav 	/* File are considered binary if they contain embedded '\0' bytes. */
20359c8e88eSDag-Erling Smørgrav 	if (embedded_nul)
20459c8e88eSDag-Erling Smørgrav 		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
20559c8e88eSDag-Erling Smørgrav 
20659c8e88eSDag-Erling Smørgrav 	return DIFF_RC_OK;
20759c8e88eSDag-Erling Smørgrav }
20859c8e88eSDag-Erling Smørgrav 
20959c8e88eSDag-Erling Smørgrav static int
diff_data_atomize_text_lines(struct diff_data * d)21059c8e88eSDag-Erling Smørgrav diff_data_atomize_text_lines(struct diff_data *d)
21159c8e88eSDag-Erling Smørgrav {
21259c8e88eSDag-Erling Smørgrav 	if (d->data == NULL)
21359c8e88eSDag-Erling Smørgrav 		return diff_data_atomize_text_lines_fd(d);
21459c8e88eSDag-Erling Smørgrav 	else
21559c8e88eSDag-Erling Smørgrav 		return diff_data_atomize_text_lines_mmap(d);
21659c8e88eSDag-Erling Smørgrav }
21759c8e88eSDag-Erling Smørgrav 
/*
 * Atomizer entry point that splits d into one atom per text line.
 * func_data is not used by this atomizer.  Returns the result of
 * diff_data_atomize_text_lines().
 */
int
diff_atomize_text_by_line(void *func_data, struct diff_data *d)
{
	int rc;

	(void)func_data;
	rc = diff_data_atomize_text_lines(d);
	return rc;
}
223