xref: /freebsd/contrib/libdiff/lib/diff_atomize_text.c (revision e38f2308273c8a51ec45f013d22c963590917cca)
1 /* Split source by line breaks, and calculate a simplistic checksum. */
2 /*
3  * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include <errno.h>
19 #include <stdbool.h>
20 #include <stdint.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <unistd.h>
24 #include <ctype.h>
25 
26 #include <arraylist.h>
27 #include <diff_main.h>
28 
29 #include "diff_internal.h"
30 #include "diff_debug.h"
31 
/*
 * Fold one more byte into a running line hash.
 *
 * The previous hash is multiplied by 23 and the byte value is added. The
 * arithmetic is done in unsigned int, so it wraps around on overflow.
 */
unsigned int
diff_atom_hash_update(unsigned int hash, unsigned char atom_byte)
{
	unsigned int next = hash;

	next *= 23;
	next += atom_byte;
	return next;
}
37 
38 static int
39 diff_data_atomize_text_lines_fd(struct diff_data *d)
40 {
41 	off_t pos = 0;
42 	const off_t end = pos + d->len;
43 	unsigned int array_size_estimate = d->len / 50;
44 	unsigned int pow2 = 1;
45 	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
46 	bool embedded_nul = false;
47 
48 	while (array_size_estimate >>= 1)
49 		pow2++;
50 
51 	ARRAYLIST_INIT(d->atoms, 1 << pow2);
52 
53 	if (fseek(d->root->f, 0L, SEEK_SET) == -1)
54 		return errno;
55 
56 	while (pos < end) {
57 		off_t line_end = pos;
58 		unsigned int hash = 0;
59 		unsigned char buf[512];
60 		size_t r, i;
61 		struct diff_atom *atom;
62 		int eol = 0;
63 
64 		while (eol == 0 && line_end < end) {
65 			r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
66 			if (r == 0 && ferror(d->root->f))
67 				return EIO;
68 			i = 0;
69 			while (eol == 0 && i < r) {
70 				if (buf[i] != '\r' && buf[i] != '\n') {
71 					if (!ignore_whitespace
72 					    || !isspace((unsigned char)buf[i]))
73 						hash = diff_atom_hash_update(
74 						    hash, buf[i]);
75 					if (buf[i] == '\0')
76 						embedded_nul = true;
77 					line_end++;
78 				} else
79 					eol = buf[i];
80 				i++;
81 			}
82 		}
83 
84 		/* When not at the end of data, the line ending char ('\r' or
85 		 * '\n') must follow */
86 		if (line_end < end)
87 			line_end++;
88 		/* If that was an '\r', also pull in any following '\n' */
89 		if (line_end < end && eol == '\r') {
90 			if (fseeko(d->root->f, line_end, SEEK_SET) == -1)
91 				return errno;
92 			r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
93 			if (r == 0 && ferror(d->root->f))
94 				return EIO;
95 			if (r > 0 && buf[0] == '\n')
96 				line_end++;
97 		}
98 
99 		/* Record the found line as diff atom */
100 		ARRAYLIST_ADD(atom, d->atoms);
101 		if (!atom)
102 			return ENOMEM;
103 
104 		*atom = (struct diff_atom){
105 			.root = d,
106 			.pos = pos,
107 			.at = NULL,	/* atom data is not memory-mapped */
108 			.len = line_end - pos,
109 			.hash = hash,
110 		};
111 
112 		/* Starting point for next line: */
113 		pos = line_end;
114 		if (fseeko(d->root->f, pos, SEEK_SET) == -1)
115 			return errno;
116 	}
117 
118 	/* File are considered binary if they contain embedded '\0' bytes. */
119 	if (embedded_nul)
120 		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
121 
122 	return DIFF_RC_OK;
123 }
124 
125 static int
126 diff_data_atomize_text_lines_mmap(struct diff_data *d)
127 {
128 	const uint8_t *pos = d->data;
129 	const uint8_t *end = pos + d->len;
130 	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
131 	bool embedded_nul = false;
132 	unsigned int array_size_estimate = d->len / 50;
133 	unsigned int pow2 = 1;
134 	while (array_size_estimate >>= 1)
135 		pow2++;
136 
137 	ARRAYLIST_INIT(d->atoms, 1 << pow2);
138 
139 	while (pos < end) {
140 		const uint8_t *line_end = pos;
141 		unsigned int hash = 0;
142 
143 		while (line_end < end && *line_end != '\r' && *line_end != '\n') {
144 			if (!ignore_whitespace
145 			    || !isspace((unsigned char)*line_end))
146 				hash = diff_atom_hash_update(hash, *line_end);
147 			if (*line_end == '\0')
148 				embedded_nul = true;
149 			line_end++;
150 		}
151 
152 		/* When not at the end of data, the line ending char ('\r' or
153 		 * '\n') must follow */
154 		if (line_end < end && *line_end == '\r')
155 			line_end++;
156 		if (line_end < end && *line_end == '\n')
157 			line_end++;
158 
159 		/* Record the found line as diff atom */
160 		struct diff_atom *atom;
161 		ARRAYLIST_ADD(atom, d->atoms);
162 		if (!atom)
163 			return ENOMEM;
164 
165 		*atom = (struct diff_atom){
166 			.root = d,
167 			.pos = (off_t)(pos - d->data),
168 			.at = pos,
169 			.len = line_end - pos,
170 			.hash = hash,
171 		};
172 
173 		/* Starting point for next line: */
174 		pos = line_end;
175 	}
176 
177 	/* File are considered binary if they contain embedded '\0' bytes. */
178 	if (embedded_nul)
179 		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
180 
181 	return DIFF_RC_OK;
182 }
183 
184 static int
185 diff_data_atomize_text_lines(struct diff_data *d)
186 {
187 	if (d->data == NULL)
188 		return diff_data_atomize_text_lines_fd(d);
189 	else
190 		return diff_data_atomize_text_lines_mmap(d);
191 }
192 
/*
 * Public entry point: split d into line atoms.
 *
 * func_data is not used by this atomizer. The return value is whatever
 * diff_data_atomize_text_lines() returns.
 */
int
diff_atomize_text_by_line(void *func_data, struct diff_data *d)
{
	int rc;

	(void)func_data;

	rc = diff_data_atomize_text_lines(d);
	return rc;
}
198