xref: /freebsd/contrib/libdiff/lib/diff_atomize_text.c (revision 59c8e88e72633afbc47a4ace0d2170d00d51f7dc)
1*59c8e88eSDag-Erling Smørgrav /* Split source by line breaks, and calculate a simplistic checksum. */
2*59c8e88eSDag-Erling Smørgrav /*
3*59c8e88eSDag-Erling Smørgrav  * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4*59c8e88eSDag-Erling Smørgrav  *
5*59c8e88eSDag-Erling Smørgrav  * Permission to use, copy, modify, and distribute this software for any
6*59c8e88eSDag-Erling Smørgrav  * purpose with or without fee is hereby granted, provided that the above
7*59c8e88eSDag-Erling Smørgrav  * copyright notice and this permission notice appear in all copies.
8*59c8e88eSDag-Erling Smørgrav  *
9*59c8e88eSDag-Erling Smørgrav  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10*59c8e88eSDag-Erling Smørgrav  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11*59c8e88eSDag-Erling Smørgrav  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12*59c8e88eSDag-Erling Smørgrav  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13*59c8e88eSDag-Erling Smørgrav  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14*59c8e88eSDag-Erling Smørgrav  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15*59c8e88eSDag-Erling Smørgrav  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16*59c8e88eSDag-Erling Smørgrav  */
17*59c8e88eSDag-Erling Smørgrav 
18*59c8e88eSDag-Erling Smørgrav #include <errno.h>
19*59c8e88eSDag-Erling Smørgrav #include <stdbool.h>
20*59c8e88eSDag-Erling Smørgrav #include <stdint.h>
21*59c8e88eSDag-Erling Smørgrav #include <stdio.h>
22*59c8e88eSDag-Erling Smørgrav #include <stdlib.h>
23*59c8e88eSDag-Erling Smørgrav #include <unistd.h>
24*59c8e88eSDag-Erling Smørgrav #include <ctype.h>
25*59c8e88eSDag-Erling Smørgrav 
26*59c8e88eSDag-Erling Smørgrav #include <arraylist.h>
27*59c8e88eSDag-Erling Smørgrav #include <diff_main.h>
28*59c8e88eSDag-Erling Smørgrav 
29*59c8e88eSDag-Erling Smørgrav #include "diff_internal.h"
30*59c8e88eSDag-Erling Smørgrav #include "diff_debug.h"
31*59c8e88eSDag-Erling Smørgrav 
32*59c8e88eSDag-Erling Smørgrav unsigned int
33*59c8e88eSDag-Erling Smørgrav diff_atom_hash_update(unsigned int hash, unsigned char atom_byte)
34*59c8e88eSDag-Erling Smørgrav {
35*59c8e88eSDag-Erling Smørgrav 	return hash * 23 + atom_byte;
36*59c8e88eSDag-Erling Smørgrav }
37*59c8e88eSDag-Erling Smørgrav 
38*59c8e88eSDag-Erling Smørgrav static int
39*59c8e88eSDag-Erling Smørgrav diff_data_atomize_text_lines_fd(struct diff_data *d)
40*59c8e88eSDag-Erling Smørgrav {
41*59c8e88eSDag-Erling Smørgrav 	off_t pos = 0;
42*59c8e88eSDag-Erling Smørgrav 	const off_t end = pos + d->len;
43*59c8e88eSDag-Erling Smørgrav 	unsigned int array_size_estimate = d->len / 50;
44*59c8e88eSDag-Erling Smørgrav 	unsigned int pow2 = 1;
45*59c8e88eSDag-Erling Smørgrav 	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
46*59c8e88eSDag-Erling Smørgrav 	bool embedded_nul = false;
47*59c8e88eSDag-Erling Smørgrav 
48*59c8e88eSDag-Erling Smørgrav 	while (array_size_estimate >>= 1)
49*59c8e88eSDag-Erling Smørgrav 		pow2++;
50*59c8e88eSDag-Erling Smørgrav 
51*59c8e88eSDag-Erling Smørgrav 	ARRAYLIST_INIT(d->atoms, 1 << pow2);
52*59c8e88eSDag-Erling Smørgrav 
53*59c8e88eSDag-Erling Smørgrav 	if (fseek(d->root->f, 0L, SEEK_SET) == -1)
54*59c8e88eSDag-Erling Smørgrav 		return errno;
55*59c8e88eSDag-Erling Smørgrav 
56*59c8e88eSDag-Erling Smørgrav 	while (pos < end) {
57*59c8e88eSDag-Erling Smørgrav 		off_t line_end = pos;
58*59c8e88eSDag-Erling Smørgrav 		unsigned int hash = 0;
59*59c8e88eSDag-Erling Smørgrav 		unsigned char buf[512];
60*59c8e88eSDag-Erling Smørgrav 		size_t r, i;
61*59c8e88eSDag-Erling Smørgrav 		struct diff_atom *atom;
62*59c8e88eSDag-Erling Smørgrav 		int eol = 0;
63*59c8e88eSDag-Erling Smørgrav 
64*59c8e88eSDag-Erling Smørgrav 		while (eol == 0 && line_end < end) {
65*59c8e88eSDag-Erling Smørgrav 			r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
66*59c8e88eSDag-Erling Smørgrav 			if (r == 0 && ferror(d->root->f))
67*59c8e88eSDag-Erling Smørgrav 				return EIO;
68*59c8e88eSDag-Erling Smørgrav 			i = 0;
69*59c8e88eSDag-Erling Smørgrav 			while (eol == 0 && i < r) {
70*59c8e88eSDag-Erling Smørgrav 				if (buf[i] != '\r' && buf[i] != '\n') {
71*59c8e88eSDag-Erling Smørgrav 					if (!ignore_whitespace
72*59c8e88eSDag-Erling Smørgrav 					    || !isspace((unsigned char)buf[i]))
73*59c8e88eSDag-Erling Smørgrav 						hash = diff_atom_hash_update(
74*59c8e88eSDag-Erling Smørgrav 						    hash, buf[i]);
75*59c8e88eSDag-Erling Smørgrav 					if (buf[i] == '\0')
76*59c8e88eSDag-Erling Smørgrav 						embedded_nul = true;
77*59c8e88eSDag-Erling Smørgrav 					line_end++;
78*59c8e88eSDag-Erling Smørgrav 				} else
79*59c8e88eSDag-Erling Smørgrav 					eol = buf[i];
80*59c8e88eSDag-Erling Smørgrav 				i++;
81*59c8e88eSDag-Erling Smørgrav 			}
82*59c8e88eSDag-Erling Smørgrav 		}
83*59c8e88eSDag-Erling Smørgrav 
84*59c8e88eSDag-Erling Smørgrav 		/* When not at the end of data, the line ending char ('\r' or
85*59c8e88eSDag-Erling Smørgrav 		 * '\n') must follow */
86*59c8e88eSDag-Erling Smørgrav 		if (line_end < end)
87*59c8e88eSDag-Erling Smørgrav 			line_end++;
88*59c8e88eSDag-Erling Smørgrav 		/* If that was an '\r', also pull in any following '\n' */
89*59c8e88eSDag-Erling Smørgrav 		if (line_end < end && eol == '\r') {
90*59c8e88eSDag-Erling Smørgrav 			if (fseeko(d->root->f, line_end, SEEK_SET) == -1)
91*59c8e88eSDag-Erling Smørgrav 				return errno;
92*59c8e88eSDag-Erling Smørgrav 			r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
93*59c8e88eSDag-Erling Smørgrav 			if (r == 0 && ferror(d->root->f))
94*59c8e88eSDag-Erling Smørgrav 				return EIO;
95*59c8e88eSDag-Erling Smørgrav 			if (r > 0 && buf[0] == '\n')
96*59c8e88eSDag-Erling Smørgrav 				line_end++;
97*59c8e88eSDag-Erling Smørgrav 		}
98*59c8e88eSDag-Erling Smørgrav 
99*59c8e88eSDag-Erling Smørgrav 		/* Record the found line as diff atom */
100*59c8e88eSDag-Erling Smørgrav 		ARRAYLIST_ADD(atom, d->atoms);
101*59c8e88eSDag-Erling Smørgrav 		if (!atom)
102*59c8e88eSDag-Erling Smørgrav 			return ENOMEM;
103*59c8e88eSDag-Erling Smørgrav 
104*59c8e88eSDag-Erling Smørgrav 		*atom = (struct diff_atom){
105*59c8e88eSDag-Erling Smørgrav 			.root = d,
106*59c8e88eSDag-Erling Smørgrav 			.pos = pos,
107*59c8e88eSDag-Erling Smørgrav 			.at = NULL,	/* atom data is not memory-mapped */
108*59c8e88eSDag-Erling Smørgrav 			.len = line_end - pos,
109*59c8e88eSDag-Erling Smørgrav 			.hash = hash,
110*59c8e88eSDag-Erling Smørgrav 		};
111*59c8e88eSDag-Erling Smørgrav 
112*59c8e88eSDag-Erling Smørgrav 		/* Starting point for next line: */
113*59c8e88eSDag-Erling Smørgrav 		pos = line_end;
114*59c8e88eSDag-Erling Smørgrav 		if (fseeko(d->root->f, pos, SEEK_SET) == -1)
115*59c8e88eSDag-Erling Smørgrav 			return errno;
116*59c8e88eSDag-Erling Smørgrav 	}
117*59c8e88eSDag-Erling Smørgrav 
118*59c8e88eSDag-Erling Smørgrav 	/* File are considered binary if they contain embedded '\0' bytes. */
119*59c8e88eSDag-Erling Smørgrav 	if (embedded_nul)
120*59c8e88eSDag-Erling Smørgrav 		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
121*59c8e88eSDag-Erling Smørgrav 
122*59c8e88eSDag-Erling Smørgrav 	return DIFF_RC_OK;
123*59c8e88eSDag-Erling Smørgrav }
124*59c8e88eSDag-Erling Smørgrav 
125*59c8e88eSDag-Erling Smørgrav static int
126*59c8e88eSDag-Erling Smørgrav diff_data_atomize_text_lines_mmap(struct diff_data *d)
127*59c8e88eSDag-Erling Smørgrav {
128*59c8e88eSDag-Erling Smørgrav 	const uint8_t *pos = d->data;
129*59c8e88eSDag-Erling Smørgrav 	const uint8_t *end = pos + d->len;
130*59c8e88eSDag-Erling Smørgrav 	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
131*59c8e88eSDag-Erling Smørgrav 	bool embedded_nul = false;
132*59c8e88eSDag-Erling Smørgrav 	unsigned int array_size_estimate = d->len / 50;
133*59c8e88eSDag-Erling Smørgrav 	unsigned int pow2 = 1;
134*59c8e88eSDag-Erling Smørgrav 	while (array_size_estimate >>= 1)
135*59c8e88eSDag-Erling Smørgrav 		pow2++;
136*59c8e88eSDag-Erling Smørgrav 
137*59c8e88eSDag-Erling Smørgrav 	ARRAYLIST_INIT(d->atoms, 1 << pow2);
138*59c8e88eSDag-Erling Smørgrav 
139*59c8e88eSDag-Erling Smørgrav 	while (pos < end) {
140*59c8e88eSDag-Erling Smørgrav 		const uint8_t *line_end = pos;
141*59c8e88eSDag-Erling Smørgrav 		unsigned int hash = 0;
142*59c8e88eSDag-Erling Smørgrav 
143*59c8e88eSDag-Erling Smørgrav 		while (line_end < end && *line_end != '\r' && *line_end != '\n') {
144*59c8e88eSDag-Erling Smørgrav 			if (!ignore_whitespace
145*59c8e88eSDag-Erling Smørgrav 			    || !isspace((unsigned char)*line_end))
146*59c8e88eSDag-Erling Smørgrav 				hash = diff_atom_hash_update(hash, *line_end);
147*59c8e88eSDag-Erling Smørgrav 			if (*line_end == '\0')
148*59c8e88eSDag-Erling Smørgrav 				embedded_nul = true;
149*59c8e88eSDag-Erling Smørgrav 			line_end++;
150*59c8e88eSDag-Erling Smørgrav 		}
151*59c8e88eSDag-Erling Smørgrav 
152*59c8e88eSDag-Erling Smørgrav 		/* When not at the end of data, the line ending char ('\r' or
153*59c8e88eSDag-Erling Smørgrav 		 * '\n') must follow */
154*59c8e88eSDag-Erling Smørgrav 		if (line_end < end && *line_end == '\r')
155*59c8e88eSDag-Erling Smørgrav 			line_end++;
156*59c8e88eSDag-Erling Smørgrav 		if (line_end < end && *line_end == '\n')
157*59c8e88eSDag-Erling Smørgrav 			line_end++;
158*59c8e88eSDag-Erling Smørgrav 
159*59c8e88eSDag-Erling Smørgrav 		/* Record the found line as diff atom */
160*59c8e88eSDag-Erling Smørgrav 		struct diff_atom *atom;
161*59c8e88eSDag-Erling Smørgrav 		ARRAYLIST_ADD(atom, d->atoms);
162*59c8e88eSDag-Erling Smørgrav 		if (!atom)
163*59c8e88eSDag-Erling Smørgrav 			return ENOMEM;
164*59c8e88eSDag-Erling Smørgrav 
165*59c8e88eSDag-Erling Smørgrav 		*atom = (struct diff_atom){
166*59c8e88eSDag-Erling Smørgrav 			.root = d,
167*59c8e88eSDag-Erling Smørgrav 			.pos = (off_t)(pos - d->data),
168*59c8e88eSDag-Erling Smørgrav 			.at = pos,
169*59c8e88eSDag-Erling Smørgrav 			.len = line_end - pos,
170*59c8e88eSDag-Erling Smørgrav 			.hash = hash,
171*59c8e88eSDag-Erling Smørgrav 		};
172*59c8e88eSDag-Erling Smørgrav 
173*59c8e88eSDag-Erling Smørgrav 		/* Starting point for next line: */
174*59c8e88eSDag-Erling Smørgrav 		pos = line_end;
175*59c8e88eSDag-Erling Smørgrav 	}
176*59c8e88eSDag-Erling Smørgrav 
177*59c8e88eSDag-Erling Smørgrav 	/* File are considered binary if they contain embedded '\0' bytes. */
178*59c8e88eSDag-Erling Smørgrav 	if (embedded_nul)
179*59c8e88eSDag-Erling Smørgrav 		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
180*59c8e88eSDag-Erling Smørgrav 
181*59c8e88eSDag-Erling Smørgrav 	return DIFF_RC_OK;
182*59c8e88eSDag-Erling Smørgrav }
183*59c8e88eSDag-Erling Smørgrav 
184*59c8e88eSDag-Erling Smørgrav static int
185*59c8e88eSDag-Erling Smørgrav diff_data_atomize_text_lines(struct diff_data *d)
186*59c8e88eSDag-Erling Smørgrav {
187*59c8e88eSDag-Erling Smørgrav 	if (d->data == NULL)
188*59c8e88eSDag-Erling Smørgrav 		return diff_data_atomize_text_lines_fd(d);
189*59c8e88eSDag-Erling Smørgrav 	else
190*59c8e88eSDag-Erling Smørgrav 		return diff_data_atomize_text_lines_mmap(d);
191*59c8e88eSDag-Erling Smørgrav }
192*59c8e88eSDag-Erling Smørgrav 
193*59c8e88eSDag-Erling Smørgrav int
194*59c8e88eSDag-Erling Smørgrav diff_atomize_text_by_line(void *func_data, struct diff_data *d)
195*59c8e88eSDag-Erling Smørgrav {
196*59c8e88eSDag-Erling Smørgrav 	return diff_data_atomize_text_lines(d);
197*59c8e88eSDag-Erling Smørgrav }
198